Skip to content

Pdf layout kg

PDFLayoutKG

Layout Knowledge Graph

Each instance builds the graph for a single PDF file.

What we will link in the layout knowledge graph:

  • Document KG
    • Input is the Markdown JSON file
    • The context order will be preserved within the Tree
  • Link Image to Page
  • Link Table to Page
  • Link Image to Context (Find Nearby Context, then Map back to the Tree)
  • Link Table to Context (Same, Find Caption, Nearby Context)
Source code in Docs2KG/kg/pdf_layout_kg.py
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
class PDFLayoutKG:
    """
    Layout Knowledge Graph

    This is for one pdf file


    What we will link in the layout knowledge graph:

    - Document KG
        - Input is the Markdown JSON file
        - The context order will be preserved within the Tree
    - Link Image to Page
    - Link Table to Page
    - Link Image to Context (Find Nearby Context, then Map back to the Tree)
    - Link Table to Context (Same, Find Caption, Nearby Context)
    """

    def __init__(
        self,
        folder_path: Path,
        input_format: str = "pdf_exported",
    ):
        """
        Set up the builder for one processed PDF.

        Args:
            folder_path (Path): Folder holding the outputs extracted from the
                PDF. It must contain ``metadata.json``; a ``kg`` sub-folder is
                created inside it to receive the exported graph.
            input_format (str): ``"pdf_exported"`` or ``"pdf_scanned"``. It
                selects which linking steps ``create_kg`` runs.
        """
        # Accept plain strings as well as Path objects.
        self.folder_path = Path(folder_path)
        self.kg_folder = self.folder_path / "kg"
        # exist_ok already covers the "folder is there" case, no pre-check needed.
        self.kg_folder.mkdir(parents=True, exist_ok=True)
        self.kg_json = {}
        # Read the metadata inside a context manager so the handle is closed.
        with (self.folder_path / "metadata.json").open() as metadata_file:
            self.metadata = json.load(metadata_file)
        # Sentence-embedding model used by text_bert_match for fuzzy matching.
        self.sentence_transformer = SentenceTransformer("all-MiniLM-L6-v2")
        self.input_format = input_format

    def create_kg(self):
        """
        Create the layout knowledge graph
        """
        self.document_kg()
        if self.input_format == "pdf_exported":
            self.link_image_to_page()
            self.link_table_to_page()
            self.link_image_to_context()
            self.link_table_to_context()
        if self.input_format == "pdf_scanned":
            # add page image
            self.link_page_image_to_page()

    def document_kg(self):
        """
        Construct the layout knowledge graph skeleton first

        We will require the md.json.csv file with the following columns:

        - layout_json
        """
        # 1. add the document node

        self.kg_json = {
            "node_type": "document",
            "uuid": str(uuid4()),
            "node_properties": self.metadata,
            "children": [],
        }

        # 2. add page nodes
        pages_json = []
        text_folder = self.folder_path / "texts"
        md_json_csv = text_folder / "md.json.csv"
        texts_json_df = pd.read_csv(md_json_csv)
        columns = texts_json_df.columns.tolist()
        logger.debug(f"Columns: {columns}")
        # we will focus on the layout json

        for index, row in texts_json_df.iterrows():
            logger.info(f"Processing page_index {index}")
            logger.debug(row["layout_json"])
            try:
                layout_json = json.loads(row["layout_json"])
                # recursively decompose the layout json and add to proper level children

                page_json = {
                    "node_type": "page",
                    "uuid": str(uuid4()),
                    "node_properties": {
                        "page_number": row["page_number"],
                        "page_text": row["text"],
                    },
                    "children": [self.recursive_layout_json(layout_json)],
                }
                pages_json.append(page_json)
            except Exception as e:
                logger.error(f"Error in row {index}: {e}")
                logger.error(row["layout_json"])
                logger.exception(e)
                # if this is an unhandled error
                # we should still keep all data for this page, so we will construct a page with everything we have
                page_json = {
                    "node_type": "page",
                    "uuid": str(uuid4()),
                    "node_properties": {
                        "page_number": row["page_number"],
                        "page_text": row["text"],
                    },
                    "children": [],
                }
                pages_json.append(page_json)

        self.kg_json["children"] = pages_json
        self.export_kg()

    def link_image_to_page(self):
        """
        Attach every extracted image block to the page it appears on.

        Rows are read from ``images/blocks_images.csv``. When no page node
        exists for a row's page number, an empty page node is appended to the
        document first. The graph is exported once all rows are handled.
        """
        csv_path = self.folder_path / "images" / "blocks_images.csv"
        if empty_check(csv_path):
            return

        for _, image_row in pd.read_csv(csv_path).iterrows():
            number = image_row["page_number"]
            target_page = self.get_page_node(number)
            if not target_page:
                logger.info(f"Page {number} not found, adding a new page node")
                target_page = {
                    "node_type": "page",
                    "uuid": str(uuid4()),
                    "node_properties": {
                        "page_number": number,
                        "page_text": "",
                    },
                    "children": [],
                }
                self.kg_json["children"].append(target_page)

            image_properties = {
                "image_path": image_row["image_path"],
                "image_block_number": image_row["block_number"],
                "bbox": image_row["bbox"],
            }
            target_page["children"].append(
                {
                    "node_type": "image",
                    "uuid": str(uuid4()),
                    "node_properties": image_properties,
                    "children": [],
                }
            )

        self.export_kg()

    def link_page_image_to_page(self):
        page_images_file = self.folder_path / "images" / "page_images.csv"
        page_images_df = pd.read_csv(page_images_file)
        for index, row in page_images_df.iterrows():
            page_number = row["page_number"]
            page_node = self.get_page_node(page_number)
            if not page_node:
                logger.info(f"Page {page_number} not found, adding a new page node")
                page_node = {
                    "node_type": "page",
                    "uuid": str(uuid4()),
                    "node_properties": {
                        "page_number": page_number,
                        "page_text": "",
                    },
                    "children": [],
                }
                self.kg_json["children"].append(page_node)
            image_node = {
                "node_type": "page_image",
                "uuid": str(uuid4()),
                "node_properties": {
                    "image_path": row["image_path"],
                },
                "children": [],
            }
            page_node["children"].append(image_node)

    def link_table_to_page(self):
        """
        Link the table file to proper page.

        Rows are read from ``tables/tables.csv`` (columns used: ``page_index``,
        ``file_path``, ``table_index`` and ``bbox``). Each row adds one
        ``table_csv`` child to its page.

        Link to proper position in the page will be in function
        `link_table_to_context`.

        When a row refers to a page that has no node yet, an empty page node
        is appended to the document first (the same fallback
        `link_image_to_page` uses) instead of failing on a missing page.
        """
        tables = self.folder_path / "tables" / "tables.csv"
        if empty_check(tables):
            return
        table_df = pd.read_csv(tables)
        for index, row in table_df.iterrows():
            logger.info(f"Processing table {index}")
            # NOTE(review): the tables CSV names this column "page_index" while
            # page nodes are keyed by "page_number" - confirm both count pages
            # from the same base.
            page_number = row["page_index"]
            page_node = self.get_page_node(page_number)
            if page_node is None:
                logger.info(f"Page {page_number} not found, adding a new page node")
                page_node = {
                    "node_type": "page",
                    "uuid": str(uuid4()),
                    "node_properties": {
                        "page_number": page_number,
                        "page_text": "",
                    },
                    "children": [],
                }
                self.kg_json["children"].append(page_node)
            page_node["children"].append(
                {
                    "node_type": "table_csv",
                    "uuid": str(uuid4()),
                    "node_properties": {
                        "table_path": row["file_path"],
                        "table_index": row["table_index"],
                        "bbox": row["bbox"],
                    },
                    # every other node type carries a children list
                    "children": [],
                }
            )

        self.export_kg()

    def link_image_to_context(self):
        """
        Link every image to the text blocks around it.

        For each row of ``images/blocks_images.csv``:

        1. Take the non-empty text blocks of the same page from
           ``texts/blocks_texts.csv``.
        2. Ask ``BlockFinder.find_closest_blocks`` which of them are closest
           to the image's bbox. The result is iterated as
           ``{position: index or None}``.
        3. Create one ``text_block`` node per hit and, through
           `link_image_to_tree_node`, record the uuids of layout-tree nodes
           whose content matches that text (stored under ``linkage``).
        4. Store the ``text_block`` nodes as children of the page's image
           node with the same ``image_block_number``.

        The graph is exported at the end.
        """
        block_images = self.folder_path / "images" / "blocks_images.csv"
        if empty_check(block_images):
            return
        images_df = pd.read_csv(block_images)
        text_block_df = pd.read_csv(self.folder_path / "texts" / "blocks_texts.csv")
        logger.debug(text_block_df.columns.tolist())
        for index, row in images_df.iterrows():
            page_number = row["page_number"]

            logger.info(f"Processing image {index} in page {page_number}")
            page_node = self.get_page_node(page_number)
            if page_node is None:
                # No page to attach to (get_page_node already logged it);
                # skip instead of crashing further down.
                continue
            # get the text blocks that are in the same page
            text_blocks = text_block_df[
                text_block_df["page_number"] == page_number
            ].copy(deep=True)
            # Drop blocks that have no text once whitespace is removed. Empty
            # CSV cells are read as NaN, and NaN != "" is True, so missing
            # values are turned into "" before the comparison.
            has_text = text_blocks["text"].fillna("").astype(str).str.strip() != ""
            text_blocks = text_blocks[has_text].reset_index()
            image_bbox = row["bbox"]
            logger.debug(f"Image bbox: {image_bbox}")
            text_blocks_bbox = text_blocks["bbox"].tolist()
            nearby_text_blocks = BlockFinder.find_closest_blocks(
                image_bbox, text_blocks_bbox
            )
            nearby_info = []
            nearby_info_dict = {}
            for key, value in nearby_text_blocks.items():
                if value is not None:
                    # presumably value is a position in text_blocks_bbox, which
                    # lines up with the labels after reset_index - verify
                    # against BlockFinder
                    text_block = text_blocks.loc[value]
                    logger.debug(text_block)
                    nearby_info.append(
                        {
                            "node_type": "text_block",
                            "uuid": str(uuid4()),
                            "node_properties": {
                                "text_block_bbox": text_block["bbox"],
                                "content": text_block["text"],
                                "position": key,
                                "text_block_number": int(text_block["block_number"]),
                            },
                            "children": [],
                        }
                    )
                    nearby_info_dict[key] = {"content": text_block["text"], "uuids": []}

            # Walk the layout tree of this page: every tree node whose content
            # matches a nearby text block (exactly or by similarity) has its
            # uuid recorded in nearby_info_dict[position]["uuids"].
            nearby_info_dict = self.link_image_to_tree_node(page_node, nearby_info_dict)
            logger.info(nearby_info_dict)

            for item in nearby_info:
                key = item["node_properties"]["position"]
                item["linkage"] = nearby_info_dict[key]["uuids"]

            # Find the image node of this row (identified by its
            # image_block_number) and hang the nearby text blocks under it.
            for child in page_node["children"]:
                if (
                    child["node_type"] == "image"
                    and child["node_properties"]["image_block_number"]
                    == row["block_number"]
                ):
                    child["children"] = nearby_info
                    break

        self.export_kg()

    def link_table_to_context(self):
        """
        Link the table to the context

        Two matching methods are combined for each row of
        ``tables/tables.csv``:

        1. The table's bbox is used to find the nearby text blocks of the same
           page (from ``texts/blocks_texts.csv``). They become ``text_block``
           children of the page's ``table_csv`` node, each with a ``linkage``
           list of matching layout-tree node uuids.
        2. The ``table`` nodes of the page's layout tree are collected in
           document order, and the one at position ``table_index`` is linked
           from the ``table_csv`` node through ``linkage``.

        The graph is exported at the end.
        """
        tables = self.folder_path / "tables" / "tables.csv"
        if empty_check(tables):
            return
        table_df = pd.read_csv(tables)
        text_block_df = pd.read_csv(self.folder_path / "texts" / "blocks_texts.csv")
        for index, row in table_df.iterrows():
            page_number = row["page_index"]
            page_node = self.get_page_node(page_number)
            if page_node is None:
                # No page to attach to (get_page_node already logged it);
                # skip instead of crashing further down.
                continue
            table_bbox = row["bbox"]
            text_blocks = text_block_df[
                text_block_df["page_number"] == page_number
            ].copy(deep=True)
            # Drop blocks that have no text once whitespace is removed. Empty
            # CSV cells are read as NaN, and NaN != "" is True, so missing
            # values are turned into "" before the comparison.
            has_text = text_blocks["text"].fillna("").astype(str).str.strip() != ""
            text_blocks = text_blocks[has_text].reset_index()
            text_blocks_bbox = text_blocks["bbox"].tolist()
            nearby_text_blocks = BlockFinder.find_closest_blocks(
                table_bbox, text_blocks_bbox
            )
            nearby_info = []
            nearby_info_dict = {}
            for key, value in nearby_text_blocks.items():
                if value is not None:
                    text_block = text_blocks.loc[value]
                    nearby_info.append(
                        {
                            "node_type": "text_block",
                            "uuid": str(uuid4()),
                            "node_properties": {
                                "text_block_bbox": text_block["bbox"],
                                "content": text_block["text"],
                                "position": key,
                                "text_block_number": int(text_block["block_number"]),
                            },
                            "children": [],
                        }
                    )
                    nearby_info_dict[key] = {"content": text_block["text"], "uuids": []}
            nearby_info_dict = self.link_image_to_tree_node(page_node, nearby_info_dict)
            for item in nearby_info:
                key = item["node_properties"]["position"]
                item["linkage"] = nearby_info_dict[key]["uuids"]

            # the second matching method, loop the tree node of the page
            table_nodes = self.get_specific_tag_nodes(page_node, "table")
            page_tree_table_node = None
            # assumes table_index is 1-based - TODO confirm against the table
            # extractor. The lower bound keeps an index of 0 (or below) from
            # wrapping around to the last table or raising on an empty list.
            table_index = row["table_index"]
            if 1 <= table_index <= len(table_nodes):
                page_tree_table_node = table_nodes[table_index - 1]

            # give table node a linkage to the table_node
            for child in page_node["children"]:
                if (
                    child["node_type"] == "table_csv"
                    and child["node_properties"]["table_index"] == row["table_index"]
                ):
                    child["children"] = nearby_info
                    if page_tree_table_node is not None:
                        # add the linkage from table_csv to table_tree_node
                        child["linkage"] = [page_tree_table_node["uuid"]]
                    break

        self.export_kg()

    def export_kg(self) -> None:
        """
        Export the knowledge graph to json file
        """
        with open(self.kg_folder / "layout_kg.json", "w") as f:
            json.dump(self.kg_json, f, indent=2)

    def load_kg(self):
        """
        Load the knowledge graph from JSON
        """
        with open(self.kg_folder / "layout_kg.json", "r") as f:
            self.kg_json = json.load(f)

    def get_page_node(self, page_number: int) -> Optional[dict]:
        """
        Get the page node

        Args:
            page_number (int): The page number

        Returns:
            page_node (dict): The page node
        """
        for page in self.kg_json["children"]:
            if str(page["node_properties"]["page_number"]) == str(page_number):
                return page
        logger.error(f"Page {page_number} not found")
        return None

    def get_specific_tag_nodes(self, tree_json: dict, tag: str) -> list:
        """
        Get the specific tag nodes from the page node

        Args:
            tree_json (dict): The tree_json
            tag (str): The tag to find

        Returns:
            list: The list of nodes with the specific tag
        """
        nodes = []
        if "children" not in tree_json:
            return nodes
        for child in tree_json["children"]:
            if child["node_type"] == tag:
                nodes.append(child)
            nodes.extend(self.get_specific_tag_nodes(child, tag))
        return nodes

    @classmethod
    def recursive_layout_json(cls, layout_json: dict) -> dict:
        """
        Recursively processes layout JSON to construct a tree structure, annotating each node with
        a unique identifier and handling specific HTML structures like tables.

        Args:
            layout_json (dict): The layout JSON object to process.

        Returns:
            dict: A tree-like JSON object with added metadata.
        """
        try:
            return cls._process_node(layout_json)
        except Exception as e:
            logger.exception("Failed to process layout JSON")
            return cls._error_node(layout_json, str(e))

    @classmethod
    def _process_node(cls, node: dict) -> dict:
        """
        Convert one layout JSON node into a graph node.

        The tag is taken from the node's ``"tag"`` value when that is a known
        HTML tag. Otherwise the first key that is itself a known HTML tag
        (ignoring surrounding whitespace) is used. A node with neither is
        returned as an untagged node.

        Args:
            node (dict): The node to process.

        Returns:
            dict: The processed node.
        """
        declared_tag = node.get("tag")
        if declared_tag in HTML_TAGS:
            return cls._create_tree_node(declared_tag, node)

        # fall back to a key that names an HTML tag
        key_tag = next((key for key in node if key.strip() in HTML_TAGS), None)
        if key_tag is None:
            return cls._untagged_node(node)
        return cls._create_tree_node(key_tag, node)

    @classmethod
    def _create_tree_node(cls, tag: str, node: dict) -> dict:
        """
        Create a tree node for the JSON structure.

        Args:
            tag (str): The HTML tag of the node.
            node (dict): The original node data.

        Returns:
            dict: A structured tree node.
        """
        node_uuid = str(uuid4())
        node_properties = {
            "content": node.get("content", ""),
            "text": json.dumps(node) if tag == "table" else "",
            "records": node.get("children", []) if tag == "table" else [],
        }
        children = [cls._process_node(child) for child in node.get("children", [])]

        return {
            "node_type": tag,
            "uuid": node_uuid,
            "node_properties": node_properties,
            "children": children,
        }

    @classmethod
    def _untagged_node(cls, node: dict) -> dict:
        """
        Handles nodes without a recognized HTML tag.

        Args:
            node (dict): The node to handle.

        Returns:
            dict: A default structured node indicating an untagged element.
        """
        return {
            "node_type": "untagged",
            "uuid": str(uuid4()),
            "node_properties": {"content": json.dumps(node)},
            "children": [],
        }

    @classmethod
    def _error_node(cls, node: dict, error_message: str) -> dict:
        """
        Create an error node when processing fails.

        Args:
            node (dict): The node that caused the error.
            error_message (str): A message describing the error.

        Returns:
            dict: An error node.
        """
        return {
            "node_type": "unknown",
            "uuid": str(uuid4()),
            "node_properties": {"content": json.dumps(node), "error": error_message},
            "children": [],
        }

    def link_image_to_tree_node(self, page_node: dict, nearby_info_dict: dict) -> dict:
        """
        Link the image to the tree node

        - Loop the children of the page node
        - If the text block is highly similar to the content, add the uuid to the nearby_info_dict

        Match method:
            − exact match
            - fuzzy match

        Args:
            page_node (dict): The page node
            nearby_info_dict (dict): The nearby info dict

        Returns:
            nearby_info_dict (dict): The updated nearby info dict
        """

        if "children" not in page_node:
            return nearby_info_dict
        for child in page_node["children"]:
            # get the text
            content = child["node_properties"].get("content", "")
            nearby_info_dict = self.link_image_to_tree_node(child, nearby_info_dict)
            if content.strip() == "":
                continue
            for key, value in nearby_info_dict.items():
                if content.strip() == value["content"].strip():
                    value["uuids"].append(child["uuid"])
                elif self.text_bert_match(content, value["content"]):
                    value["uuids"].append(child["uuid"])

        return nearby_info_dict

    def text_bert_match(
        self, text1: str, text2: str, threshold_value: float = 0.8
    ) -> bool:
        """
        Fuzzy match the text

        Args:
            text1 (str): The first text
            text2 (str): The second text
            threshold_value (float): The threshold value

        Returns:
            bool: Whether the text is similar
        """
        embedding1 = self.sentence_transformer.encode([text1])
        embedding2 = self.sentence_transformer.encode([text2])
        similarity = self.sentence_transformer.similarity(embedding1, embedding2)

        # get the first value from the similarity matrix, and to float
        similarity = similarity[0].item()
        matched = similarity > threshold_value

        if matched:
            logger.debug(f"Matched: {text1} | {text2}")
            logger.debug(f"Similarity: {similarity}")
        return matched

__init__(folder_path, input_format='pdf_exported')

Initialize the class with the pdf file

Parameters:

Name Type Description Default
folder_path Path

The path to the folder holding the outputs extracted from the PDF (it must contain metadata.json)

required
Source code in Docs2KG/kg/pdf_layout_kg.py
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
def __init__(
    self,
    folder_path: Path,
    input_format: str = "pdf_exported",
):
    """
    Set up the builder for one processed PDF.

    Args:
        folder_path (Path): Folder holding the outputs extracted from the
            PDF. It must contain ``metadata.json``; a ``kg`` sub-folder is
            created inside it to receive the exported graph.
        input_format (str): ``"pdf_exported"`` or ``"pdf_scanned"``. It
            selects which linking steps ``create_kg`` runs.
    """
    # Accept plain strings as well as Path objects.
    self.folder_path = Path(folder_path)
    self.kg_folder = self.folder_path / "kg"
    # exist_ok already covers the "folder is there" case, no pre-check needed.
    self.kg_folder.mkdir(parents=True, exist_ok=True)
    self.kg_json = {}
    # Read the metadata inside a context manager so the handle is closed.
    with (self.folder_path / "metadata.json").open() as metadata_file:
        self.metadata = json.load(metadata_file)
    # Sentence-embedding model used by text_bert_match for fuzzy matching.
    self.sentence_transformer = SentenceTransformer("all-MiniLM-L6-v2")
    self.input_format = input_format

create_kg()

Create the layout knowledge graph

Source code in Docs2KG/kg/pdf_layout_kg.py
57
58
59
60
61
62
63
64
65
66
67
68
69
def create_kg(self):
    """
    Build the whole layout knowledge graph.

    The document/page skeleton is always built first. The linking steps
    that follow are chosen by ``self.input_format``.
    """
    self.document_kg()

    fmt = self.input_format
    if fmt == "pdf_exported":
        self.link_image_to_page()
        self.link_table_to_page()
        self.link_image_to_context()
        self.link_table_to_context()
    elif fmt == "pdf_scanned":
        # scanned input only gets the page image attached
        self.link_page_image_to_page()

document_kg()

Construct the layout knowledge graph skeleton first

We will require the md.json.csv file with the following columns:

  • layout_json
Source code in Docs2KG/kg/pdf_layout_kg.py
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
def document_kg(self):
    """
    Construct the layout knowledge graph skeleton first

    Reads ``texts/md.json.csv`` under ``self.folder_path``. The following
    columns are used:

    - page_number
    - text
    - layout_json

    One page node is created per CSV row. When the row's ``layout_json``
    can be parsed, the resulting layout tree becomes the page's only
    child; otherwise the page is kept with no children so its number and
    text are not lost. The graph is stored in ``self.kg_json`` and
    exported at the end.
    """
    # 1. add the document node
    self.kg_json = {
        "node_type": "document",
        "uuid": str(uuid4()),
        "node_properties": self.metadata,
        "children": [],
    }

    # 2. add page nodes
    md_json_csv = self.folder_path / "texts" / "md.json.csv"
    texts_json_df = pd.read_csv(md_json_csv)
    columns = texts_json_df.columns.tolist()
    logger.debug(f"Columns: {columns}")

    pages_json = []
    for index, row in texts_json_df.iterrows():
        logger.info(f"Processing page_index {index}")
        logger.debug(row["layout_json"])

        # An empty text cell is read back as NaN, which json.dump would
        # write as the non-standard token NaN; store an empty string.
        page_text = row["text"]
        if pd.isna(page_text):
            page_text = ""

        # The page node is the same whether or not the layout parses;
        # only its children depend on the parse result.
        page_json = {
            "node_type": "page",
            "uuid": str(uuid4()),
            "node_properties": {
                "page_number": row["page_number"],
                "page_text": page_text,
            },
            "children": [],
        }
        try:
            layout_json = json.loads(row["layout_json"])
            # recursively decompose the layout json into a tree
            page_json["children"].append(self.recursive_layout_json(layout_json))
        except Exception as e:
            # Best effort: keep the page with everything we have.
            logger.error(f"Error in row {index}: {e}")
            logger.error(row["layout_json"])
            logger.exception(e)
        pages_json.append(page_json)

    self.kg_json["children"] = pages_json
    self.export_kg()

export_kg()

Export the knowledge graph to json file

Source code in Docs2KG/kg/pdf_layout_kg.py
390
391
392
393
394
395
def export_kg(self) -> None:
    """
    Write ``self.kg_json`` to ``layout_kg.json`` inside ``self.kg_folder``.
    """
    target = self.kg_folder / "layout_kg.json"
    with target.open(mode="w") as handle:
        json.dump(self.kg_json, handle, indent=2)

get_page_node(page_number)

Get the page node

Parameters:

Name Type Description Default
page_number int

The page number

required

Returns:

Name Type Description
page_node dict

The page node

Source code in Docs2KG/kg/pdf_layout_kg.py
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
def get_page_node(self, page_number: int) -> Optional[dict]:
    """
    Look up a page node by its page number.

    Page numbers are compared by their string form, so ``1`` and ``"1"``
    refer to the same page.

    Args:
        page_number (int): The page number

    Returns:
        page_node (dict): The first matching page node, or None (after
            logging an error) when no page has that number
    """
    wanted = str(page_number)
    found = next(
        (
            candidate
            for candidate in self.kg_json["children"]
            if str(candidate["node_properties"]["page_number"]) == wanted
        ),
        None,
    )
    if found is None:
        logger.error(f"Page {page_number} not found")
    return found

get_specific_tag_nodes(tree_json, tag)

Get the specific tag nodes from the page node

Parameters:

Name Type Description Default
tree_json dict

The tree_json

required
tag str

The tag to find

required

Returns:

Name Type Description
list list

The list of nodes with the specific tag

Source code in Docs2KG/kg/pdf_layout_kg.py
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
def get_specific_tag_nodes(self, tree_json: dict, tag: str) -> list:
    """
    Collect every descendant of ``tree_json`` whose ``node_type`` is ``tag``.

    The root itself is never part of the result. Matches are returned in
    depth-first, document order (a node before its own descendants).

    Args:
        tree_json (dict): The tree_json
        tag (str): The tag to find

    Returns:
        list: The list of nodes with the specific tag
    """
    matches = []
    if "children" not in tree_json:
        return matches

    # Explicit stack; children are pushed reversed so that the first
    # child is the next one popped.
    pending = tree_json["children"][::-1]
    while pending:
        current = pending.pop()
        if current["node_type"] == tag:
            matches.append(current)
        if "children" in current:
            pending.extend(current["children"][::-1])
    return matches

Construct the image knowledge graph

Source code in Docs2KG/kg/pdf_layout_kg.py
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
def link_image_to_context(self):
    """
    Attach the surrounding text blocks to every image node.

    For each row of ``images/blocks_images.csv``:

    1. Take the non-empty text blocks of the same page from
       ``texts/blocks_texts.csv``.
    2. Ask ``BlockFinder.find_closest_blocks`` which of them are closest to
       the image bbox (one candidate per position key).
    3. Build a ``text_block`` node for each candidate and record, under
       ``linkage``, the uuids of the page-tree nodes whose content matches
       the block text (see ``link_image_to_tree_node``).
    4. Store those nodes as the children of the image node whose
       ``image_block_number`` equals the row's ``block_number``.

    Images whose page is not present in the graph are skipped with a
    warning. The graph is exported once all images are processed. Nothing
    happens when the image CSV is reported empty by ``empty_check``.
    """
    block_images = self.folder_path / "images" / "blocks_images.csv"
    if empty_check(block_images):
        return
    images_df = pd.read_csv(block_images)
    text_block_df = pd.read_csv(self.folder_path / "texts" / "blocks_texts.csv")
    logger.debug(text_block_df.columns.tolist())
    for index, row in images_df.iterrows():
        page_number = row["page_number"]

        logger.info(f"Processing image {index} in page {page_number}")
        page_node = self.get_page_node(page_number)
        if page_node is None:
            # Without a page node there is no tree to link against and no
            # image node to attach the context to.
            logger.warning(
                f"Skipping image {index}: page {page_number} is not in the graph"
            )
            continue
        # get the text blocks that are in the same page
        text_blocks = text_block_df[
            text_block_df["page_number"] == page_number
        ].copy(deep=True)
        # Drop blocks without text. pandas reads empty CSV cells as NaN, so
        # NaN is mapped to "" first; otherwise such rows would survive the
        # comparison and reach the matcher as non-string content.
        has_text = text_blocks["text"].fillna("").astype(str).str.strip() != ""
        # reset_index makes the labels 0..n-1, i.e. the positions in the
        # bbox list handed to the finder below
        text_blocks = text_blocks[has_text].reset_index()
        image_bbox = row["bbox"]
        logger.debug(f"Image bbox: {image_bbox}")
        text_blocks_bbox = text_blocks["bbox"].tolist()
        nearby_text_blocks = BlockFinder.find_closest_blocks(
            image_bbox, text_blocks_bbox
        )
        nearby_info = []
        nearby_info_dict = {}
        for key, value in nearby_text_blocks.items():
            if value is not None:
                text_block = text_blocks.loc[value]
                logger.debug(text_block)
                nearby_info.append(
                    {
                        "node_type": "text_block",
                        "uuid": str(uuid4()),
                        "node_properties": {
                            "text_block_bbox": text_block["bbox"],
                            "content": text_block["text"],
                            "position": key,
                            "text_block_number": int(text_block["block_number"]),
                        },
                        "children": [],
                    }
                )
                nearby_info_dict[key] = {"content": text_block["text"], "uuids": []}
        # Walk the nodes of this page recursively: whenever a node's content
        # is highly similar to one of the nearby text blocks, its uuid is
        # added to that block's entry in nearby_info_dict.
        nearby_info_dict = self.link_image_to_tree_node(page_node, nearby_info_dict)
        logger.info(nearby_info_dict)

        for item in nearby_info:
            key = item["node_properties"]["position"]
            item["linkage"] = nearby_info_dict[key]["uuids"]

        # Find the image node (identified by its image_block_number) and
        # store the nearby text blocks as its children.
        for child in page_node["children"]:
            if (
                child["node_type"] == "image"
                and child["node_properties"]["image_block_number"]
                == row["block_number"]
            ):
                child["children"] = nearby_info
                break

    self.export_kg()

Loop over the images and assign each one to its page. If the page does not exist, add a page node.

Source code in Docs2KG/kg/pdf_layout_kg.py
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
def link_image_to_page(self):
    """
    Add one ``image`` node per extracted image to the page it belongs to.

    Rows are read from ``images/blocks_images.csv``. When the page of an
    image is not in the graph yet, an empty page node is created and
    appended to the root first. The graph is exported at the end; nothing
    happens when ``empty_check`` reports the CSV as empty.
    """
    block_images = self.folder_path / "images" / "blocks_images.csv"
    if empty_check(block_images):
        return

    images_df = pd.read_csv(block_images)
    for _, image_row in images_df.iterrows():
        page_number = image_row["page_number"]
        page_node = self.get_page_node(page_number)
        if not page_node:
            logger.info(f"Page {page_number} not found, adding a new page node")
            page_node = dict(
                node_type="page",
                uuid=str(uuid4()),
                node_properties=dict(page_number=page_number, page_text=""),
                children=[],
            )
            self.kg_json["children"].append(page_node)

        # The block number is what later identifies this node when its
        # surrounding context is attached.
        page_node["children"].append(
            dict(
                node_type="image",
                uuid=str(uuid4()),
                node_properties=dict(
                    image_path=image_row["image_path"],
                    image_block_number=image_row["block_number"],
                    bbox=image_row["bbox"],
                ),
                children=[],
            )
        )

    self.export_kg()

Link the image to the tree node

  • Loop the children of the page node
  • If the text block is highly similar to the content, add the uuid to the nearby_info_dict
Match method

- exact match
- fuzzy match

Parameters:

Name Type Description Default
page_node dict

The page node

required
nearby_info_dict dict

The nearby info dict

required

Returns:

Name Type Description
nearby_info_dict dict

The updated nearby info dict

Source code in Docs2KG/kg/pdf_layout_kg.py
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
def link_image_to_tree_node(self, page_node: dict, nearby_info_dict: dict) -> dict:
    """
    Link nearby text blocks to the tree nodes that hold the same text.

    Walks every descendant of ``page_node``. For each descendant with
    non-blank textual ``content``, its uuid is appended to the ``uuids``
    list of every ``nearby_info_dict`` entry whose content matches.

    Match method:
        - exact match (after stripping surrounding whitespace)
        - fuzzy match (``self.text_bert_match``), tried only when the exact
          match fails

    Content that is not a string, on either side, is skipped instead of
    raising. This covers None and NaN (pandas yields NaN for empty CSV cells).

    Args:
        page_node (dict): The node whose descendants are searched; a node
            without a ``children`` key is a no-op
        nearby_info_dict (dict): Maps a position key to
            ``{"content": <text>, "uuids": [...]}``; updated in place

    Returns:
        nearby_info_dict (dict): The same dict, with matching uuids appended
    """

    if "children" not in page_node:
        return nearby_info_dict
    for child in page_node["children"]:
        content = child["node_properties"].get("content", "")
        # Descend before matching this node, so uuids of deeper nodes are
        # recorded ahead of their ancestors.
        nearby_info_dict = self.link_image_to_tree_node(child, nearby_info_dict)
        if not isinstance(content, str):
            continue
        stripped_content = content.strip()
        if not stripped_content:
            continue
        for value in nearby_info_dict.values():
            candidate = value["content"]
            if not isinstance(candidate, str):
                continue
            # Cheap exact comparison first; the embedding-based comparison
            # only runs when it fails.
            if stripped_content == candidate.strip() or self.text_bert_match(
                content, candidate
            ):
                value["uuids"].append(child["uuid"])

    return nearby_info_dict

Link the table to the context

We have two ways to make it work

  1. Loop the table, and for tree leaf within the page node, if it is tagged as table, then link them together
  2. We have bbox of the table, so we can find the nearby text block, and link them together
Source code in Docs2KG/kg/pdf_layout_kg.py
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
def link_table_to_context(self):
    """
    Link every extracted table to its context.

    Two complementary links are built for each row of ``tables/tables.csv``:

    1. Nearby text: the non-empty text blocks of the same page that
       ``BlockFinder.find_closest_blocks`` reports as closest to the table
       bbox become ``text_block`` children of the ``table_csv`` node. Each
       one carries, under ``linkage``, the uuids of the page-tree nodes
       whose content matches its text.
    2. Tree table: the ``table`` nodes found in the page tree are matched
       to the CSV table by position, and the uuid of the matching one is
       stored as the ``linkage`` of the ``table_csv`` node.

    Tables whose page is not present in the graph are skipped with a
    warning. The graph is exported once all tables are processed. Nothing
    happens when the table CSV is reported empty by ``empty_check``.
    """
    tables = self.folder_path / "tables" / "tables.csv"
    if empty_check(tables):
        return
    table_df = pd.read_csv(tables)
    text_block_df = pd.read_csv(self.folder_path / "texts" / "blocks_texts.csv")
    for index, row in table_df.iterrows():
        page_number = row["page_index"]
        page_node = self.get_page_node(page_number)
        if page_node is None:
            # Without a page node there is no tree to link against and no
            # table_csv node to attach the context to.
            logger.warning(
                f"Skipping table {index}: page {page_number} is not in the graph"
            )
            continue
        table_bbox = row["bbox"]
        text_blocks = text_block_df[
            text_block_df["page_number"] == page_number
        ].copy(deep=True)
        # Drop blocks without text. pandas reads empty CSV cells as NaN, so
        # NaN is mapped to "" first; otherwise such rows would survive the
        # comparison and reach the matcher as non-string content.
        has_text = text_blocks["text"].fillna("").astype(str).str.strip() != ""
        # reset_index makes the labels 0..n-1, i.e. the positions in the
        # bbox list handed to the finder below
        text_blocks = text_blocks[has_text].reset_index()
        text_blocks_bbox = text_blocks["bbox"].tolist()
        nearby_text_blocks = BlockFinder.find_closest_blocks(
            table_bbox, text_blocks_bbox
        )
        nearby_info = []
        nearby_info_dict = {}
        for key, value in nearby_text_blocks.items():
            if value is not None:
                text_block = text_blocks.loc[value]
                nearby_info.append(
                    {
                        "node_type": "text_block",
                        "uuid": str(uuid4()),
                        "node_properties": {
                            "text_block_bbox": text_block["bbox"],
                            "content": text_block["text"],
                            "position": key,
                            "text_block_number": int(text_block["block_number"]),
                        },
                        "children": [],
                    }
                )
                nearby_info_dict[key] = {"content": text_block["text"], "uuids": []}
        nearby_info_dict = self.link_image_to_tree_node(page_node, nearby_info_dict)
        for item in nearby_info:
            key = item["node_properties"]["position"]
            item["linkage"] = nearby_info_dict[key]["uuids"]

        # the second matching method, loop the tree node of the page
        table_nodes = self.get_specific_tag_nodes(page_node, "table")
        page_tree_table_node = None
        # table_index is used as a 1-based position among the page's table
        # nodes. NOTE(review): confirm the extractor numbers tables from 1.
        # The lower bound matters: an index of 0 would otherwise select
        # table_nodes[-1], or raise IndexError when the page has no table
        # node at all.
        table_index = row["table_index"]
        if 1 <= table_index <= len(table_nodes):
            page_tree_table_node = table_nodes[table_index - 1]

        # give table node a linkage to the table_node
        for child in page_node["children"]:
            if (
                child["node_type"] == "table_csv"
                and child["node_properties"]["table_index"] == row["table_index"]
            ):
                child["children"] = nearby_info
                if page_tree_table_node:
                    # add the linkage from table_csv to table_tree_node
                    child["linkage"] = [page_tree_table_node["uuid"]]
                break

    self.export_kg()

Link the table file to proper page.

Linking to the proper position within the page is handled by the function

link_table_to_context

Source code in Docs2KG/kg/pdf_layout_kg.py
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
def link_table_to_page(self):
    """
    Link the table file to proper page.

    One ``table_csv`` node is appended to the page node of every row in
    ``tables/tables.csv``. If the page does not exist in the graph yet, an
    empty page node is added first (the same policy as
    ``link_image_to_page``), so a table on an unknown page no longer aborts
    the run.

    Linking to the proper position within the page is done by
    `link_table_to_context`.

    The graph is exported at the end; nothing happens when ``empty_check``
    reports the CSV as empty.
    """
    tables = self.folder_path / "tables" / "tables.csv"
    if empty_check(tables):
        return
    table_df = pd.read_csv(tables)
    for index, row in table_df.iterrows():
        logger.info(f"Processing table {index}")
        page_number = row["page_index"]
        page_node = self.get_page_node(page_number)
        if not page_node:
            logger.info(f"Page {page_number} not found, adding a new page node")
            page_node = {
                "node_type": "page",
                "uuid": str(uuid4()),
                "node_properties": {
                    "page_number": page_number,
                    "page_text": "",
                },
                "children": [],
            }
            self.kg_json["children"].append(page_node)
        page_node["children"].append(
            {
                "node_type": "table_csv",
                "uuid": str(uuid4()),
                "node_properties": {
                    "table_path": row["file_path"],
                    "table_index": row["table_index"],
                    "bbox": row["bbox"],
                },
                # Same shape as every other node in the graph; the context
                # step replaces this list with the nearby text blocks.
                "children": [],
            }
        )

    self.export_kg()

load_kg()

Load the knowledge graph from JSON

Source code in Docs2KG/kg/pdf_layout_kg.py
397
398
399
400
401
402
def load_kg(self):
    """
    Read ``layout_kg.json`` from ``self.kg_folder`` and keep the parsed
    graph in ``self.kg_json``.
    """
    kg_path = self.kg_folder / "layout_kg.json"
    with open(kg_path, "r") as kg_file:
        raw_graph = kg_file.read()
    self.kg_json = json.loads(raw_graph)

recursive_layout_json(layout_json) classmethod

Recursively processes layout JSON to construct a tree structure, annotating each node with a unique identifier and handling specific HTML structures like tables.

Parameters:

Name Type Description Default
layout_json dict

The layout JSON object to process.

required

Returns:

Name Type Description
dict dict

A tree-like JSON object with added metadata.

Source code in Docs2KG/kg/pdf_layout_kg.py
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
@classmethod
def recursive_layout_json(cls, layout_json: dict) -> dict:
    """
    Convert a layout JSON object into a tree by delegating to
    ``cls._process_node``.

    This wrapper never raises for ordinary exceptions: any ``Exception``
    from the processing step is logged with its traceback, and the value
    built by ``cls._error_node`` from the input and the error message is
    returned instead.

    Args:
        layout_json (dict): The layout JSON object to process.

    Returns:
        dict: The processed tree, or the error node when processing failed.
    """
    try:
        tree = cls._process_node(layout_json)
    except Exception as error:
        logger.exception("Failed to process layout JSON")
        tree = cls._error_node(layout_json, str(error))
    return tree

text_bert_match(text1, text2, threshold_value=0.8)

Fuzzy match the text

Parameters:

Name Type Description Default
text1 str

The first text

required
text2 str

The second text

required
threshold_value float

The threshold value

0.8

Returns:

Name Type Description
bool bool

Whether the text is similar

Source code in Docs2KG/kg/pdf_layout_kg.py
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
def text_bert_match(
    self, text1: str, text2: str, threshold_value: float = 0.8
) -> bool:
    """
    Decide whether two texts are similar enough to count as the same.

    Each text is encoded on its own with ``self.sentence_transformer`` and
    the similarity of the two embeddings is compared with the threshold.
    The comparison is strict: a score equal to the threshold is not a match.

    Args:
        text1 (str): The first text
        text2 (str): The second text
        threshold_value (float): The score that must be exceeded

    Returns:
        bool: Whether the text is similar
    """
    encoder = self.sentence_transformer
    first_embedding = encoder.encode([text1])
    second_embedding = encoder.encode([text2])
    score_matrix = encoder.similarity(first_embedding, second_embedding)

    # One text against one text: take the first row of the score matrix and
    # unwrap it into a plain Python number.
    similarity = score_matrix[0].item()
    matched = similarity > threshold_value
    if not matched:
        return matched

    logger.debug(f"Matched: {text1} | {text2}")
    logger.debug(f"Similarity: {similarity}")
    return matched