Bases: DigitizationBase
Enhanced PDFDocling class with separate exports for markdown, images, and tables.
Source code in Docs2KG/digitization/image/pdf_docling.py
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
class PDFDocling(DigitizationBase):
    """
    Convert a PDF with Docling and export its content as a Markdown file.

    The converter is configured with ``IMAGE_RESOLUTION_SCALE`` as the image
    scale and with page-image and picture-image generation enabled. The only
    export implemented here is Markdown (see ``export_markdown``).

    NOTE(review): ``self.output_dir`` and ``self.filename`` are used below but
    not assigned in this class -- presumably provided by ``DigitizationBase``;
    confirm against the base class.
    """

    def __init__(self, file_path: Path):
        """
        Build the Docling converter for a single PDF source.

        Args:
            file_path: Location of the PDF; forwarded to the base class
                together with ``InputFormat.PDF`` as the only supported format.
        """
        super().__init__(file_path=file_path, supported_formats=[InputFormat.PDF])
        pipeline_options = PdfPipelineOptions()
        pipeline_options.images_scale = IMAGE_RESOLUTION_SCALE
        pipeline_options.generate_page_images = True
        pipeline_options.generate_picture_images = True
        self.converter = DocumentConverter(
            format_options={
                InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
            }
        )

    @staticmethod
    def validate_input(input_data: Union[str, Path]) -> bool:
        """
        Check whether ``input_data`` looks like a usable PDF source.

        Args:
            input_data: A local path, or a string starting with ``http://`` or
                ``https://``.

        Returns:
            For an http(s) string: True when it ends with ``.pdf``
            (case-insensitive); the URL is not fetched. Otherwise: True when
            the path exists and its suffix is ``.pdf`` (case-insensitive).
            False if any exception is raised while checking.
        """
        try:
            if isinstance(input_data, str) and input_data.startswith(
                ("http://", "https://")
            ):
                return input_data.lower().endswith(".pdf")
            path = Path(input_data)
            return path.exists() and path.suffix.lower() == ".pdf"
        except Exception as e:
            # Best-effort validation: log the failure and report "invalid"
            # instead of propagating.
            logger.exception(f"Error validating input: {str(e)}")
            return False

    def export_markdown(self, document) -> Path:
        """
        Export document content to a markdown file.

        Args:
            document: Object exposing ``save_as_markdown`` (in ``process`` this
                is the ``document`` attribute of the conversion result).

        Returns:
            ``self.output_dir / "<self.filename>.md"``, the path handed to
            ``save_as_markdown``.
        """
        markdown_path = self.output_dir / f"{self.filename}.md"
        document.save_as_markdown(
            markdown_path,
            image_mode=ImageRefMode.REFERENCED,
            artifacts_dir=self.output_dir / "images",
        )
        return markdown_path

    def process(self) -> Path:
        """
        Convert the PDF and export it to Markdown.

        Returns:
            Path of the Markdown file returned by ``export_markdown``.

        Raises:
            ValueError: If ``self.file_path`` does not pass ``validate_input``.
            FileNotFoundError: If conversion or export raises
                ``FileNotFoundError``; the original error is chained as the cause.
            Exception: For any other failure during conversion or export; the
                original error is chained as the cause.
        """
        if not self.validate_input(self.file_path):
            raise ValueError(
                f"Invalid input: {self.file_path}. Expected valid PDF file path or URL"
            )
        try:
            # Convert the document
            result = self.converter.convert(str(self.file_path))
            # Export the converted document and hand back the file location
            return self.export_markdown(result.document)
        except FileNotFoundError as e:
            # Explicit chaining keeps the underlying error as the direct cause.
            raise FileNotFoundError(f"PDF source not found: {self.file_path}") from e
        except Exception as e:
            raise Exception(f"Error processing PDF: {str(e)}") from e

    def __repr__(self) -> str:
        return f"PDFDocling(file_path='{self.file_path}')"
|
export_markdown(document)
Export document content to markdown file.
Source code in Docs2KG/digitization/image/pdf_docling.py
47
48
49
50
51
52
53
54
def export_markdown(self, document) -> Path:
    """
    Save ``document`` as Markdown and return the file path.

    The target is ``self.output_dir / "<self.filename>.md"``; the
    ``images`` sub-directory of ``self.output_dir`` is passed as the
    artifacts directory.
    """
    target = self.output_dir / f"{self.filename}.md"
    images_dir = self.output_dir / "images"
    # NOTE(review): REFERENCED presumably links images by path instead of
    # embedding them -- confirm against the Docling documentation.
    document.save_as_markdown(
        target,
        image_mode=ImageRefMode.REFERENCED,
        artifacts_dir=images_dir,
    )
    return target
|
process()
Process PDF document and generate all outputs.
Source code in Docs2KG/digitization/image/pdf_docling.py
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
def process(self) -> Path:
    """
    Convert the PDF and export it to Markdown.

    Returns:
        Path of the Markdown file returned by ``export_markdown``.

    Raises:
        ValueError: If ``self.file_path`` does not pass ``validate_input``.
        FileNotFoundError: If conversion or export raises
            ``FileNotFoundError``; the original error is chained as the cause.
        Exception: For any other failure during conversion or export; the
            original error is chained as the cause.
    """
    if not self.validate_input(self.file_path):
        raise ValueError(
            f"Invalid input: {self.file_path}. Expected valid PDF file path or URL"
        )
    try:
        # Convert the document
        result = self.converter.convert(str(self.file_path))
        # Export the converted document and hand back the file location
        return self.export_markdown(result.document)
    except FileNotFoundError as e:
        # Explicit chaining keeps the underlying error as the direct cause.
        raise FileNotFoundError(f"PDF source not found: {self.file_path}") from e
    except Exception as e:
        raise Exception(f"Error processing PDF: {str(e)}") from e
|