Skip to content

Ebook

Chapter dataclass

Data class for storing chapter information

Source code in Docs2KG/digitization/native/ebook.py
15
16
17
18
19
20
21
22
@dataclass
class Chapter:
    """Data class for storing chapter information"""

    title: str
    content: str
    order: int
    images: List[Dict[str, str]]

EPUBDigitization

Bases: DigitizationBase

EPUB digitization agent that converts EPUB files to markdown, extracts images, and generates structured data.

Inherits from DigitizationBase and implements EPUB-specific processing logic.

Source code in Docs2KG/digitization/native/ebook.py
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
class EPUBDigitization(DigitizationBase):
    """
    EPUB digitization agent that converts EPUB files to markdown, extracts images,
    and generates structured data.

    Inherits from DigitizationBase and implements EPUB-specific processing logic.
    """

    def __init__(self, file_path: Path):
        super().__init__(file_path=file_path, supported_formats=["epub"])
        self.book = None
        self.chapters: List[Chapter] = []
        self.metadata: Dict[str, Any] = {}
        self.image_counter = 0

        # Configure HTML to Text converter
        self.text_maker = html2text.HTML2Text()
        self.text_maker.ignore_links = False
        self.text_maker.ignore_images = False
        self.text_maker.body_width = 0

    def validate_input(self, input_data: Union[str, Path]) -> bool:
        """
        Validate if the input is a valid EPUB file.

        Args:
            input_data: File path to validate

        Returns:
            bool: True if input is valid EPUB, False otherwise
        """
        try:
            path = Path(input_data)
            if not path.exists():
                logger.error(f"File not found: {path}")
                return False

            if path.suffix.lower() != ".epub":
                logger.error(f"Invalid file format: {path.suffix}")
                return False

            # Try to load the EPUB file
            epub.read_epub(str(path))
            return True

        except Exception as e:
            logger.error(f"Error validating EPUB: {str(e)}")
            return False

    def extract_images_from_chapter(
        self, chapter_content: str, chapter_id: str
    ) -> List[Dict[str, str]]:
        """
        Extract and save images from chapter content.

        Args:
            chapter_content: HTML content of the chapter
            chapter_id: Identifier for the chapter

        Returns:
            List of dictionaries containing image information
        """
        images = []
        soup = BeautifulSoup(chapter_content, "html.parser")

        for img in soup.find_all("img"):
            try:
                img_src = img.get("src", "")
                if not img_src:
                    continue

                self.image_counter += 1
                img_filename = f"image_{chapter_id}_{self.image_counter}.jpg"
                img_path = self.output_dir / "images" / img_filename

                for item in self.book.get_items_of_type(ebooklib.ITEM_IMAGE):
                    if item.file_name.endswith(img_src.split("/")[-1]):
                        with open(img_path, "wb") as f:
                            f.write(item.content)

                        images.append(
                            {
                                "original_src": img_src,
                                "saved_path": str(
                                    img_path.relative_to(self.output_dir)
                                ),
                                "alt_text": img.get("alt", ""),
                                "chapter_id": chapter_id,
                            }
                        )

                        img["src"] = str(img_path.relative_to(self.output_dir))

            except Exception as e:
                logger.error(f"Error processing image: {str(e)}")

        return images

    def process_chapter(self, chapter_item, order: int) -> Optional[Chapter]:
        """
        Process a single chapter from the EPUB.

        Args:
            chapter_item: EpubItem containing chapter content
            order: Chapter order number

        Returns:
            Chapter object or None if processing fails
        """
        try:
            content = chapter_item.get_content().decode("utf-8")
            soup = BeautifulSoup(content, "html.parser")

            title = soup.find("title")
            title = title.text if title else f"Chapter {order}"

            images = self.extract_images_from_chapter(content, f"ch{order}")
            markdown_content = self.text_maker.handle(str(soup))

            return Chapter(
                title=title, content=markdown_content, order=order, images=images
            )

        except Exception as e:
            logger.error(f"Error processing chapter: {str(e)}")
            return None

    def extract_metadata(self):
        """Extract metadata from the EPUB file."""
        try:
            self.metadata = {
                "title": self.book.get_metadata("DC", "title"),
                "creator": self.book.get_metadata("DC", "creator"),
                "language": self.book.get_metadata("DC", "language"),
                "publisher": self.book.get_metadata("DC", "publisher"),
                "identifier": self.book.get_metadata("DC", "identifier"),
                "date": self.book.get_metadata("DC", "date"),
            }

            # Clean up metadata values
            for key, value in self.metadata.items():
                if isinstance(value, list) and value:
                    self.metadata[key] = value[0][0]
                elif not value:
                    self.metadata[key] = None

        except Exception as e:
            logger.error(f"Error extracting metadata: {str(e)}")

    def process(self, input_data: Optional[Any] = None) -> Dict[str, Any]:
        """
        Process the EPUB file and generate all outputs.

        Args:
            input_data: Optional additional input data (not used in this implementation)

        Returns:
            Dictionary containing paths to generated outputs
        """
        if not self.validate_input(self.file_path):
            raise ValueError(f"Invalid EPUB file: {self.file_path}")

        try:
            # Read the EPUB file
            self.book = epub.read_epub(str(self.file_path))

            # Extract metadata
            self.extract_metadata()

            # Process chapters
            all_images = []
            markdown_content = []

            # Add metadata section
            markdown_content.append("---")
            for key, value in self.metadata.items():
                if value:
                    markdown_content.append(f"{key}: {value}")
            markdown_content.append("---\n")

            # Process chapters and collect content
            for idx, item in enumerate(
                self.book.get_items_of_type(ebooklib.ITEM_DOCUMENT)
            ):
                chapter = self.process_chapter(item, idx)
                if chapter:
                    self.chapters.append(chapter)
                    markdown_content.extend(
                        [f"# {chapter.title}\n", chapter.content, "---\n"]
                    )
                    all_images.extend(chapter.images)

            # Export markdown content
            markdown_path = self.export_content_to_markdown_file(
                "\n".join(markdown_content)
            )

            # Export images data
            images_data = {"total_images": len(all_images), "images": all_images}
            images_json_path = self.export_images_to_json_file(images_data)

            # Export basic structure data as table
            structure_data = {
                "metadata": self.metadata,
                "chapters": [
                    {
                        "title": ch.title,
                        "order": ch.order,
                        "image_count": len(ch.images),
                    }
                    for ch in self.chapters
                ],
            }
            table_json_path = self.export_table_to_json_file(structure_data)

            return {
                "markdown_path": markdown_path,
                "images_json_path": images_json_path,
                "table_json_path": table_json_path,
                "total_chapters": len(self.chapters),
                "total_images": len(all_images),
            }

        except Exception as e:
            logger.error(f"Error processing EPUB: {str(e)}")
            raise

extract_images_from_chapter(chapter_content, chapter_id)

Extract and save images from chapter content.

Parameters:

Name Type Description Default
chapter_content str

HTML content of the chapter

required
chapter_id str

Identifier for the chapter

required

Returns:

Type Description
List[Dict[str, str]]

List of dictionaries containing image information

Source code in Docs2KG/digitization/native/ebook.py
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
def extract_images_from_chapter(
    self, chapter_content: str, chapter_id: str
) -> List[Dict[str, str]]:
    """
    Extract and save images from chapter content.

    Args:
        chapter_content: HTML content of the chapter
        chapter_id: Identifier for the chapter

    Returns:
        List of dictionaries containing image information
    """
    images = []
    soup = BeautifulSoup(chapter_content, "html.parser")

    for img in soup.find_all("img"):
        try:
            img_src = img.get("src", "")
            if not img_src:
                continue

            self.image_counter += 1
            img_filename = f"image_{chapter_id}_{self.image_counter}.jpg"
            img_path = self.output_dir / "images" / img_filename

            for item in self.book.get_items_of_type(ebooklib.ITEM_IMAGE):
                if item.file_name.endswith(img_src.split("/")[-1]):
                    with open(img_path, "wb") as f:
                        f.write(item.content)

                    images.append(
                        {
                            "original_src": img_src,
                            "saved_path": str(
                                img_path.relative_to(self.output_dir)
                            ),
                            "alt_text": img.get("alt", ""),
                            "chapter_id": chapter_id,
                        }
                    )

                    img["src"] = str(img_path.relative_to(self.output_dir))

        except Exception as e:
            logger.error(f"Error processing image: {str(e)}")

    return images

extract_metadata()

Extract metadata from the EPUB file.

Source code in Docs2KG/digitization/native/ebook.py
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
def extract_metadata(self):
    """Extract metadata from the EPUB file."""
    try:
        self.metadata = {
            "title": self.book.get_metadata("DC", "title"),
            "creator": self.book.get_metadata("DC", "creator"),
            "language": self.book.get_metadata("DC", "language"),
            "publisher": self.book.get_metadata("DC", "publisher"),
            "identifier": self.book.get_metadata("DC", "identifier"),
            "date": self.book.get_metadata("DC", "date"),
        }

        # Clean up metadata values
        for key, value in self.metadata.items():
            if isinstance(value, list) and value:
                self.metadata[key] = value[0][0]
            elif not value:
                self.metadata[key] = None

    except Exception as e:
        logger.error(f"Error extracting metadata: {str(e)}")

process(input_data=None)

Process the EPUB file and generate all outputs.

Parameters:

Name Type Description Default
input_data Optional[Any]

Optional additional input data (not used in this implementation)

None

Returns:

Type Description
Dict[str, Any]

Dictionary containing paths to generated outputs

Source code in Docs2KG/digitization/native/ebook.py
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
def process(self, input_data: Optional[Any] = None) -> Dict[str, Any]:
    """
    Process the EPUB file and generate all outputs.

    Args:
        input_data: Optional additional input data (not used in this implementation)

    Returns:
        Dictionary containing paths to generated outputs
    """
    if not self.validate_input(self.file_path):
        raise ValueError(f"Invalid EPUB file: {self.file_path}")

    try:
        # Read the EPUB file
        self.book = epub.read_epub(str(self.file_path))

        # Extract metadata
        self.extract_metadata()

        # Process chapters
        all_images = []
        markdown_content = []

        # Add metadata section
        markdown_content.append("---")
        for key, value in self.metadata.items():
            if value:
                markdown_content.append(f"{key}: {value}")
        markdown_content.append("---\n")

        # Process chapters and collect content
        for idx, item in enumerate(
            self.book.get_items_of_type(ebooklib.ITEM_DOCUMENT)
        ):
            chapter = self.process_chapter(item, idx)
            if chapter:
                self.chapters.append(chapter)
                markdown_content.extend(
                    [f"# {chapter.title}\n", chapter.content, "---\n"]
                )
                all_images.extend(chapter.images)

        # Export markdown content
        markdown_path = self.export_content_to_markdown_file(
            "\n".join(markdown_content)
        )

        # Export images data
        images_data = {"total_images": len(all_images), "images": all_images}
        images_json_path = self.export_images_to_json_file(images_data)

        # Export basic structure data as table
        structure_data = {
            "metadata": self.metadata,
            "chapters": [
                {
                    "title": ch.title,
                    "order": ch.order,
                    "image_count": len(ch.images),
                }
                for ch in self.chapters
            ],
        }
        table_json_path = self.export_table_to_json_file(structure_data)

        return {
            "markdown_path": markdown_path,
            "images_json_path": images_json_path,
            "table_json_path": table_json_path,
            "total_chapters": len(self.chapters),
            "total_images": len(all_images),
        }

    except Exception as e:
        logger.error(f"Error processing EPUB: {str(e)}")
        raise

process_chapter(chapter_item, order)

Process a single chapter from the EPUB.

Parameters:

Name Type Description Default
chapter_item

EpubItem containing chapter content

required
order int

Chapter order number

required

Returns:

Type Description
Optional[Chapter]

Chapter object or None if processing fails

Source code in Docs2KG/digitization/native/ebook.py
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
def process_chapter(self, chapter_item, order: int) -> Optional[Chapter]:
    """
    Process a single chapter from the EPUB.

    Args:
        chapter_item: EpubItem containing chapter content
        order: Chapter order number

    Returns:
        Chapter object or None if processing fails
    """
    try:
        content = chapter_item.get_content().decode("utf-8")
        soup = BeautifulSoup(content, "html.parser")

        title = soup.find("title")
        title = title.text if title else f"Chapter {order}"

        images = self.extract_images_from_chapter(content, f"ch{order}")
        markdown_content = self.text_maker.handle(str(soup))

        return Chapter(
            title=title, content=markdown_content, order=order, images=images
        )

    except Exception as e:
        logger.error(f"Error processing chapter: {str(e)}")
        return None

validate_input(input_data)

Validate if the input is a valid EPUB file.

Parameters:

Name Type Description Default
input_data Union[str, Path]

File path to validate

required

Returns:

Name Type Description
bool bool

True if input is valid EPUB, False otherwise

Source code in Docs2KG/digitization/native/ebook.py
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
def validate_input(self, input_data: Union[str, Path]) -> bool:
    """
    Validate if the input is a valid EPUB file.

    Args:
        input_data: File path to validate

    Returns:
        bool: True if input is valid EPUB, False otherwise
    """
    try:
        path = Path(input_data)
        if not path.exists():
            logger.error(f"File not found: {path}")
            return False

        if path.suffix.lower() != ".epub":
            logger.error(f"Invalid file format: {path.suffix}")
            return False

        # Try to load the EPUB file
        epub.read_epub(str(path))
        return True

    except Exception as e:
        logger.error(f"Error validating EPUB: {str(e)}")
        return False