Skip to content

Base

SemanticKGConstructionBase

Bases: KGConstructionBase

Starting from the layout json, we will have several different ways to extract entities and relationships from the documents

The task is typically split into two parts: (1) Named Entity Recognition, which extracts entities from the text (the input can be an entity list, an ontology, or just a description); and (2) Relationship Extraction, which extracts relationships between entities.

Input will be an array of layout json files, output will be another json with entities and relationships extracted

Source code in Docs2KG/kg_construction/semantic_kg/base.py
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
class SemanticKGConstructionBase(KGConstructionBase):
    """
    Base class for semantic knowledge graph construction.

    Starting from the layout json, we will have several different ways to extract entities and relationships from the documents

    The task is typically split into two parts:
    - Named Entity Recognition: extract entities from the text
        - input can be: entity list, ontology, or just description
    - Relationship Extraction: extract relationships between entities

    Input will be an array of layout json files, output will be another json with entities and relationships extracted
    """

    def __init__(self, project_id: str):
        """
        Initialise the base constructor by delegating to the parent class.

        Args:
            project_id: Identifier forwarded unchanged to ``KGConstructionBase``
        """
        super().__init__(project_id)

    @staticmethod
    def load_layout_kg(layout_kg_path: Path) -> dict:
        """
        Load the layout knowledge graph from a file.

        Args:
            layout_kg_path: Path to the layout knowledge graph file
                (a plain string path is accepted as well)

        Returns:
            dict: Layout knowledge graph

        Raises:
            FileNotFoundError: If no file exists at ``layout_kg_path``
        """
        # Coerce so that callers passing a str do not fail on `.exists()`.
        layout_kg_path = Path(layout_kg_path)
        if not layout_kg_path.exists():
            raise FileNotFoundError(
                f"Layout knowledge graph not found at {layout_kg_path}"
            )
        # JSON is UTF-8 by specification; do not rely on the platform default.
        with open(layout_kg_path, "r", encoding="utf-8") as file:
            layout_kg = json.load(file)
        return layout_kg

    def load_entity_type(self):
        """
        Collect entity types from the entity list CSV and the ontology json.

        The union of both sources is stored in ``self.entity_type_list``. When
        the entity list contributes types the ontology does not have yet, the
        ontology json file is rewritten with the combined list.

        Any exception raised along the way (missing entity list, malformed
        files, ...) is logged and swallowed; in that case
        ``self.entity_type_list`` may not have been assigned.
        """
        try:
            entity_list_path = Path(PROJECT_CONFIG.semantic_kg.entity_list)
            if not entity_list_path.exists():
                raise FileNotFoundError(f"Entity list not found at {entity_list_path}")
            with timer(logger, "Loading entity list"):
                # The separator regex splits on the LAST comma of each line only,
                # so entity text containing commas stays in the first column.
                df = pd.read_csv(entity_list_path, sep=r",(?=[^,]*$)", engine="python")
            # get all entity types
            entity_type_list = df["entity_type"].unique()

            # read from ontology json
            ontology_json_path = Path(PROJECT_CONFIG.semantic_kg.ontology)
            # Bind both names up front so the code below never touches an
            # unbound `ontology` when the file is missing.
            ontology = None
            ontology_entity_types = []
            if not ontology_json_path.exists():
                logger.warning(f"Ontology json not found at {ontology_json_path}")
            else:
                with timer(logger, "Loading ontology json"):
                    with open(ontology_json_path, "r", encoding="utf-8") as f:
                        ontology_json = json.load(f)
                logger.info(f"Ontology json: {ontology_json}")
                ontology = Ontology(**ontology_json)
                # get all entity types from ontology
                ontology_entity_types = ontology.entity_types

            # combine the entity types from entity list and ontology
            known_types = set(ontology_entity_types)
            self.entity_type_list = list(set(entity_type_list) | known_types)

            # update ontology json if needed: only when the entity list brought
            # in at least one type the ontology does not know yet. Comparing
            # sets (not list lengths) keeps this correct even if the ontology
            # lists a type more than once.
            if set(self.entity_type_list) - known_types:
                if ontology is None:
                    # There is no loaded ontology to update. Building a fresh
                    # one would require knowing which fields `Ontology` needs,
                    # so skip the write instead of failing.
                    logger.warning(
                        "Ontology json is missing; skipping ontology update "
                        f"at {ontology_json_path}"
                    )
                else:
                    ontology.entity_types = self.entity_type_list
                    json_str = ontology.model_dump_json()
                    with open(ontology_json_path, "w", encoding="utf-8") as f:
                        f.write(json_str)
        except Exception as e:
            logger.exception(e)

    @staticmethod
    def update_layout_kg(layout_kg_path: Path, layout_kg: dict) -> None:
        """
        Update the layout knowledge graph in a file.

        Args:
            layout_kg_path: Path to the layout knowledge graph file
            layout_kg: Layout knowledge graph to update

        Raises:
            TypeError: If ``layout_kg`` contains values json cannot serialise;
                the existing file is left untouched in that case
        """
        # Serialise BEFORE opening: opening with "w" truncates the file, so a
        # serialisation error afterwards would destroy the previous content.
        serialized = json.dumps(layout_kg, indent=2)
        with open(layout_kg_path, "w", encoding="utf-8") as file:
            file.write(serialized)

    def construct_kg(self, input_data: Any) -> None:
        """
        Construct a semantic knowledge graph from input data.

        The base implementation is a no-op.

        Args:
            input_data: Input data to construct the knowledge graph
        """

    @staticmethod
    def unique_entities(entities):
        """
        Drop duplicate entities while keeping the original order.

        Two entities are duplicates when their ``start``, ``end``, ``text``
        and ``label`` values are all equal; the first occurrence is kept.

        Args:
            entities: Iterable of mappings providing the keys ``start``,
                ``end``, ``text`` and ``label``

        Returns:
            list: The de-duplicated entities, in first-seen order
        """
        unique_entities = []
        seen_entities = set()
        for entity in entities:
            key = (
                entity["start"],
                entity["end"],
                entity["text"],
                entity["label"],
            )
            if key not in seen_entities:
                unique_entities.append(entity)
                seen_entities.add(key)
        return unique_entities

construct_kg(input_data)

Construct a semantic knowledge graph from input data.

Parameters:

Name Type Description Default
input_data Any

Input data to construct the knowledge graph

required
Source code in Docs2KG/kg_construction/semantic_kg/base.py
 98
 99
100
101
102
103
104
105
def construct_kg(self, input_data: Any) -> None:
    """
    Build a semantic knowledge graph from the supplied input.

    This implementation is a no-op: it ignores its argument and returns
    ``None``.

    Args:
        input_data: Input data to construct the knowledge graph
    """
    return None

load_layout_kg(layout_kg_path) staticmethod

Load the layout knowledge graph from a file.

Parameters:

Name Type Description Default
layout_kg_path Path

Path to the layout knowledge graph file

required

Returns:

Name Type Description
dict dict

Layout knowledge graph

Source code in Docs2KG/kg_construction/semantic_kg/base.py
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
@staticmethod
def load_layout_kg(layout_kg_path: Path) -> dict:
    """
    Load the layout knowledge graph from a file.

    Args:
        layout_kg_path: Path to the layout knowledge graph file
            (a plain string path is accepted as well)

    Returns:
        dict: Layout knowledge graph

    Raises:
        FileNotFoundError: If no file exists at ``layout_kg_path``
    """
    # Coerce so that callers passing a str do not fail on `.exists()`.
    layout_kg_path = Path(layout_kg_path)
    if not layout_kg_path.exists():
        raise FileNotFoundError(
            f"Layout knowledge graph not found at {layout_kg_path}"
        )
    # JSON is UTF-8 by specification; do not rely on the platform default.
    with open(layout_kg_path, "r", encoding="utf-8") as file:
        layout_kg = json.load(file)
    return layout_kg

update_layout_kg(layout_kg_path, layout_kg) staticmethod

Update the layout knowledge graph in a file.

Parameters:

Name Type Description Default
layout_kg_path Path

Path to the layout knowledge graph file

required
layout_kg dict

Layout knowledge graph to update

required
Source code in Docs2KG/kg_construction/semantic_kg/base.py
86
87
88
89
90
91
92
93
94
95
96
@staticmethod
def update_layout_kg(layout_kg_path: Path, layout_kg: dict) -> None:
    """
    Update the layout knowledge graph in a file.

    Args:
        layout_kg_path: Path to the layout knowledge graph file
        layout_kg: Layout knowledge graph to update

    Raises:
        TypeError: If ``layout_kg`` contains values json cannot serialise;
            the existing file is left untouched in that case
    """
    # Serialise BEFORE opening: opening with "w" truncates the file, so a
    # serialisation error afterwards would destroy the previous content.
    serialized = json.dumps(layout_kg, indent=2)
    with open(layout_kg_path, "w", encoding="utf-8") as file:
        file.write(serialized)