Skip to content

Base

Base class for digitization methods that handles conversion of various input formats into standardized digital representations.

DigitizationBase

Bases: ABC

Abstract base class for digitization agents that defines the common interface and functionality for all digitization implementations.

Attributes:

Name Type Description
name str

Unique identifier for the digitization agent

supported_formats List[str]

List of input formats this agent can process

The output will be exported to: markdown (text content), JSON for tables, and JSON for images and files.

Source code in Docs2KG/digitization/base.py
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
class DigitizationBase(ABC):
    """
    Abstract base class for digitization agents that defines the common interface
    and functionality for all digitization implementations.

    Attributes:
        file_path (Path): Path of the input file handed to the agent
        filename (str): Stem of ``file_path`` (file name without its suffix)
        name (str): Identifier of the agent; set to the concrete class name
        supported_formats (List[str]): List of input formats this agent can process

    The output will be exported to
    - markdown
    - json for tables
    - json for images and files
    """

    def __init__(
        self,
        file_path: Path,
        supported_formats: Optional[List[str]] = None,
    ):
        """
        Initialise the agent for a single input file.

        Args:
            file_path: Location of the file to digitize
            supported_formats: Formats the agent can process; defaults to an
                empty list when omitted
        """
        # Normalise to Path so that a plain string path does not fail on `.stem`.
        self.file_path = Path(file_path)
        self.filename = self.file_path.stem
        self.name = self.__class__.__name__
        self.supported_formats = supported_formats or []

    @property
    def output_dir(self) -> Path:
        """
        Get the output directory for the digitization agent.

        The directory is ``<configured output dir>/<filename>/<agent name>``.
        It and its ``images`` sub folder are created on access if missing.

        Returns:
            Path: Output directory path
        """
        output_dir_path = PROJECT_CONFIG.data.output_dir
        # based on the filename, we will create a folder
        output_dir = Path(output_dir_path) / self.filename / self.name

        # Creating the images sub folder with parents=True also creates
        # output_dir itself, so a single mkdir call covers both.
        (output_dir / "images").mkdir(parents=True, exist_ok=True)
        return output_dir

    @abstractmethod
    def process(self, input_data: Any) -> Union[Dict, Any]:
        """
        Process the input data and return digitized output.

        Args:
            input_data: The data to be digitized

        Returns:
            Digitized representation of the input data

        Raises:
            NotImplementedError: If the child class doesn't implement this method
            ValueError: If input format is not supported
        """
        raise NotImplementedError("Each digitization agent must implement process()")

    def _write_output(self, file_name: str, content: str) -> Path:
        """Write ``content`` to ``file_name`` inside the output directory."""
        # Resolve output_dir once: every access of the property touches the
        # file system, and the same path must be used for writing and returning.
        target = self.output_dir / file_name
        # Explicit UTF-8 so non-ASCII text is written identically on every
        # platform instead of depending on the locale's default encoding.
        target.write_text(content, encoding="utf-8")
        return target

    def export_content_to_markdown_file(self, text: str) -> Path:
        """
        Write the markdown text to ``<filename>.md`` in the output directory.

        Args:
            text: Markdown content to write

        Returns:
            Path: Location of the written markdown file
        """
        return self._write_output(f"{self.filename}.md", text)

    def export_table_to_json_file(self, data: Dict) -> Path:
        """
        Write table data to ``<filename>_table.json`` in the output directory.

        Args:
            data: JSON-serializable table data

        Returns:
            Path: Location of the written JSON file
        """
        return self._write_output(
            f"{self.filename}_table.json", json.dumps(data, indent=4)
        )

    def export_images_to_json_file(self, data: Dict) -> Path:
        """
        Write image data to ``<filename>_images.json`` in the output directory.

        Args:
            data: JSON-serializable image data

        Returns:
            Path: Location of the written JSON file
        """
        return self._write_output(
            f"{self.filename}_images.json", json.dumps(data, indent=4)
        )

    def validate_input(self, input_data: Any) -> bool:
        """
        Validate if the input data format is supported by this agent.

        Args:
            input_data: The data to validate

        Returns:
            bool: True if input format is supported, False otherwise
        """
        return True  # Base implementation accepts all formats

    def get_agent_info(self) -> Dict[str, Any]:
        """
        Get information about the digitization agent.

        Returns:
            Dict containing agent metadata and configuration
        """
        return {
            "name": self.__class__.__name__,
            "supported_formats": self.supported_formats,
        }

    def __repr__(self) -> str:
        return f"{self.__class__.__name__}(supported_formats={self.supported_formats})"

    def __str__(self) -> str:
        return f"{self.name} Digitization Agent"

output_dir property

Get the output directory for the digitization agent.

Returns:

Name Type Description
output_dir Path

Output directory path

get_agent_info()

Get information about the digitization agent.

Returns:

Type Description
Dict[str, Any]

Dict containing agent metadata and configuration

Source code in Docs2KG/digitization/base.py
104
105
106
107
108
109
110
111
112
113
114
def get_agent_info(self) -> Dict[str, Any]:
    """
    Describe this digitization agent.

    Returns:
        Dict with the agent's class name under "name" and its list of
        supported formats under "supported_formats"
    """
    agent_name = self.__class__.__name__
    info: Dict[str, Any] = {"name": agent_name}
    info["supported_formats"] = self.supported_formats
    return info

process(input_data) abstractmethod

Process the input data and return digitized output.

Parameters:

Name Type Description Default
input_data Any

The data to be digitized

required

Returns:

Type Description
Union[Dict, Any]

Digitized representation of the input data

Raises:

Type Description
NotImplementedError

If the child class doesn't implement this method

ValueError

If input format is not supported

Source code in Docs2KG/digitization/base.py
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
@abstractmethod
def process(self, input_data: Any) -> Union[Dict, Any]:
    """
    Turn the given input into its digitized representation.

    Concrete digitization agents must override this method; the base
    version only signals that no implementation exists.

    Args:
        input_data: The data to be digitized

    Returns:
        Digitized representation of the input data

    Raises:
        NotImplementedError: If the child class doesn't implement this method
        ValueError: If input format is not supported
    """
    message = "Each digitization agent must implement process()"
    raise NotImplementedError(message)

validate_input(input_data)

Validate if the input data format is supported by this agent.

Parameters:

Name Type Description Default
input_data Any

The data to validate

required

Returns:

Name Type Description
bool bool

True if input format is supported, False otherwise

Source code in Docs2KG/digitization/base.py
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
def validate_input(self, input_data: Any) -> bool:
    """
    Report whether this agent supports the format of the given input.

    Args:
        input_data: The data to validate

    Returns:
        bool: True if input format is supported, False otherwise
    """
    # The base implementation performs no checks and accepts all formats.
    is_supported = True
    return is_supported