Skip to content

Word docling

DOCXMammoth

Bases: DigitizationBase

DOCXMammoth class for processing Word (DOCX) documents, converting them to markdown using mammoth.

Source code in Docs2KG/digitization/native/word_docling.py
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
class DOCXMammoth(DigitizationBase):
    """
    Convert Word (DOCX) documents to markdown using mammoth.

    The document is read with ``mammoth.convert_to_markdown`` and the result
    is written to ``<output_dir>/<filename>.md``. ``output_dir`` and
    ``filename`` are read from the instance but not set in this class;
    presumably they are provided by ``DigitizationBase`` (confirm there).
    """

    def __init__(self, file_path: Path):
        """
        Initialise the processor for a single DOCX file.

        Args:
            file_path: Path to the DOCX file to process
        """
        super().__init__(file_path=file_path, supported_formats=["docx"])

    @staticmethod
    def validate_input(input_data: Union[str, Path]) -> bool:
        """
        Validate if the input is a valid DOCX file path.

        Args:
            input_data: Path to DOCX file (string or Path object)

        Returns:
            bool: True if the path is an existing regular file with a
                ``.docx`` suffix (case-insensitive), False otherwise
        """
        try:
            path = Path(input_data)
            # is_file() rather than exists(): a directory that happens to be
            # named "*.docx" is not a valid document.
            return path.is_file() and path.suffix.lower() == ".docx"
        except Exception as e:
            # e.g. input_data of a type Path() cannot accept
            logger.exception(f"Error validating input: {str(e)}")
            return False

    def export_markdown(self, content: str) -> Path:
        """
        Export content to markdown file.

        Args:
            content: The markdown content to export

        Returns:
            Path: Path to the generated markdown file
        """
        markdown_path = self.output_dir / f"{self.filename}.md"
        markdown_path.write_text(content, encoding="utf-8")
        return markdown_path

    def process(self) -> Path:
        """
        Process DOCX document and generate markdown output.

        Returns:
            Path: Path to the generated markdown file

        Raises:
            ValueError: If input is not a valid DOCX file
            FileNotFoundError: If DOCX file doesn't exist
            Exception: If conversion or writing the markdown file fails; the
                original error is attached as ``__cause__``
        """
        if not self.validate_input(self.file_path):
            raise ValueError(
                f"Invalid input: {self.file_path}. Expected valid DOCX file"
            )

        try:
            # Convert DOCX to markdown using mammoth (binary stream input)
            with open(self.file_path, "rb") as docx_file:
                result = mammoth.convert_to_markdown(docx_file)

            # Log any conversion messages reported by mammoth
            for message in result.messages or ():
                logger.info(f"Conversion message: {message}")

            # Save markdown content to file
            return self.export_markdown(result.value)

        except FileNotFoundError as e:
            # Chain explicitly so the original traceback is kept as the cause
            raise FileNotFoundError(f"DOCX file not found: {self.file_path}") from e
        except Exception as e:
            raise Exception(f"Error processing DOCX: {str(e)}") from e

    def __repr__(self) -> str:
        # Use the real class name; the previous hard-coded "DOCXDocling"
        # did not match this class.
        return f"{type(self).__name__}(file_path='{self.file_path}')"

export_markdown(content)

Export content to markdown file.

Parameters:

Name Type Description Default
content str

The markdown content to export

required

Returns:

Name Type Description
Path Path

Path to the generated markdown file

Source code in Docs2KG/digitization/native/word_docling.py
37
38
39
40
41
42
43
44
45
46
47
48
49
def export_markdown(self, content: str) -> Path:
    """
    Write markdown text to ``<output_dir>/<filename>.md``.

    Args:
        content: The markdown text to store

    Returns:
        Path: Location of the markdown file that was written
    """
    target_name = f"{self.filename}.md"
    destination = self.output_dir.joinpath(target_name)
    destination.write_text(content, encoding="utf-8")
    return destination

process()

Process DOCX document and generate markdown output.

Returns:

Name Type Description
Path Path

Path to the generated markdown file

Raises:

Type Description
ValueError

If input is not a valid DOCX file

FileNotFoundError

If DOCX file doesn't exist

Source code in Docs2KG/digitization/native/word_docling.py
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
def process(self) -> Path:
    """
    Process DOCX document and generate markdown output.

    Returns:
        Path: Path to the generated markdown file

    Raises:
        ValueError: If input is not a valid DOCX file
        FileNotFoundError: If DOCX file doesn't exist
        Exception: If conversion or writing the markdown file fails; the
            original error is attached as ``__cause__``
    """
    if not self.validate_input(self.file_path):
        raise ValueError(
            f"Invalid input: {self.file_path}. Expected valid DOCX file"
        )

    try:
        # Convert DOCX to markdown using mammoth (binary stream input)
        with open(self.file_path, "rb") as docx_file:
            result = mammoth.convert_to_markdown(docx_file)

        # Log any conversion messages reported by mammoth
        for message in result.messages or ():
            logger.info(f"Conversion message: {message}")

        # Save markdown content to file
        return self.export_markdown(result.value)

    except FileNotFoundError as e:
        # Chain explicitly so the original traceback is kept as the cause
        raise FileNotFoundError(f"DOCX file not found: {self.file_path}") from e
    except Exception as e:
        raise Exception(f"Error processing DOCX: {str(e)}") from e

validate_input(input_data) staticmethod

Validate if the input is a valid DOCX file path.

Parameters:

Name Type Description Default
input_data Union[str, Path]

Path to DOCX file (string or Path object)

required

Returns:

Name Type Description
bool bool

True if input is valid DOCX file, False otherwise

Source code in Docs2KG/digitization/native/word_docling.py
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
@staticmethod
def validate_input(input_data: Union[str, Path]) -> bool:
    """
    Validate if the input is a valid DOCX file path.

    Args:
        input_data: Path to DOCX file (string or Path object)

    Returns:
        bool: True if input is valid DOCX file, False otherwise
    """
    try:
        path = Path(input_data)
        return path.exists() and path.suffix.lower() == ".docx"
    except Exception as e:
        logger.exception(f"Error validating input: {str(e)}")
        return False