Skip to content

Dynamic schema

DynamicSchema

For the unified knowledge graph, especially the semantic part, the schema is dynamic.

This will require two things:

  • From top-down methodological perspective, we can use ontology based way to implement the schema.
    • However, it will require quite a lot of pre-work before we can embrace the usage of LLM
  • So we use it from another perspective, which is bottom-up.
    • We will have the defined schema first, and then merge the schema
    • The merge process will include two parts:
      • Machine based, automatic merge
        • Frequency based merge
        • Similarity based merge
        • Other strategies
      • Human based, manual merge
Source code in Docs2KG/kg/dynamic_schema.py
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
class DynamicSchema:
    """
    For the unified knowledge graph, especially the semantic part, the schema is dynamic.

    This requires two things:

    - From a top-down methodological perspective, we can use an ontology based way to implement the schema.
        - However, it will require quite a lot of pre-work before we can embrace the usage of LLM
    - So we use it from another perspective, which is bottom-up.
        - We will have the defined schema first, and then merge the schema
        - The merge process will include two parts:
            - Machine based, automatic merge
                - Frequency based merge
                - Similarity based merge
                - Other strategies
            - Human based, manual merge

    """

    def __init__(
        self, kg_json_file: Path, merge_freq: int = 10, merge_similarity: float = 0.98
    ):
        """
        Initialize the dynamic schema class.

        Args:
            kg_json_file (Path): The path of the knowledge graph json file
            merge_freq (int): Labels occurring fewer than this many times are merged away
            merge_similarity (float): Cosine-similarity threshold above which two labels merge
        """
        self.kg_json_file = kg_json_file
        # Use a context manager so the JSON file handle is closed
        # deterministically (a bare ``open()`` previously leaked it).
        with kg_json_file.open() as f:
            self.kg_json = json.load(f)
        self.merge_freq = merge_freq
        self.merge_similarity = merge_similarity
        # label -> occurrence count; populated by schema_extraction()
        self.nodes_freq = {}
        # BERT model/tokenizer for label similarity, loaded lazily on first use
        self.similarity_model = None
        self.tokenizer = None

    def schema_extraction(self):
        """
        Extract the schema from the knowledge graph: count how often each node
        label occurs and store the result in ``self.nodes_freq``.
        """
        nodes = self.kg_json["nodes"]

        node_df = pd.DataFrame(nodes)
        # "labels" is a list-valued column, so explode it before counting
        unique_labels = node_df["labels"].explode().value_counts()
        logger.info(f"Unique labels: {unique_labels}")
        self.nodes_freq = unique_labels.to_dict()

    def schema_freq_merge(self) -> dict:
        """
        Replace every label under the frequency threshold with the generic
        "text_block" label; HTML tag labels are never merged.

        Returns:
            merge_mapping (dict): The mapping of the merge, key is the original label, value is the new label
        """
        merge_mapping = {
            label: "text_block"
            for label, count in self.nodes_freq.items()
            if label.lower() not in HTML_TAGS and count < self.merge_freq
        }
        logger.debug(f"Merge mapping: {merge_mapping} based on frequency")
        return merge_mapping

    def schema_similarity_merge(self) -> dict:
        """
        Merge labels whose BERT embeddings are nearly identical.

        Returns:
            merge_mapping (dict): The mapping of the merge, key is the original label, value is the new label
        """
        merge_mapping = {}
        if self.similarity_model is None:
            # load BERT lazily so construction stays cheap; reused across calls
            self.similarity_model = BertModel.from_pretrained("bert-base-uncased")
            self.tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

        def encode_label(label: str):
            # CLS-token embedding as the label representation
            inputs = self.tokenizer(label, return_tensors="pt")
            with torch.no_grad():
                outputs = self.similarity_model(**inputs)
            return outputs.last_hidden_state[:, 0, :].numpy().flatten()

        labels = list(self.nodes_freq.keys())
        logger.debug(labels)
        if not labels:
            # nothing extracted yet -> nothing to merge
            return merge_mapping

        label_matrix = np.array([encode_label(label) for label in labels])

        # One vectorized call for all pairs instead of O(n^2) separate calls.
        similarity = cosine_similarity(label_matrix)

        for i in range(len(labels)):
            for j in range(i + 1, len(labels)):
                # HTML tags are structural labels and must never be merged;
                # check before any similarity work.
                if labels[i].lower() in HTML_TAGS or labels[j].lower() in HTML_TAGS:
                    continue
                if similarity[i, j] <= self.merge_similarity:
                    continue
                # Map the later label onto the earlier one, and only when
                # neither side already appears as a source: the previous
                # symmetric double loop produced a->b AND b->a cycles.
                if labels[i] not in merge_mapping and labels[j] not in merge_mapping:
                    merge_mapping[labels[j]] = labels[i]

        logger.info(f"Merge mapping: {merge_mapping} based on similarity")
        return merge_mapping

    def human_in_the_loop_input(self) -> dict:
        """
        Convert the schema into the dict
        {
            key: number of occurrence
            ...
        }

        Then a human will make the mapping decision based on the values.
        """
        logger.info(f"Schema: {self.nodes_freq}")
        return self.nodes_freq

__init__(kg_json_file, merge_freq=10, merge_similarity=0.98)

Initialize the dynamic schema class Args: kg_json_file (Path): The path of the knowledge graph json file merge_freq (int): The frequency of the label, if it is lower than this, we will ignore it merge_similarity (float): The similarity threshold for the merge

Returns:

Source code in Docs2KG/kg/dynamic_schema.py
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
def __init__(
    self, kg_json_file: Path, merge_freq: int = 10, merge_similarity: float = 0.98
):
    """
    Initialize the dynamic schema class.

    Args:
        kg_json_file (Path): The path of the knowledge graph json file
        merge_freq (int): Labels occurring fewer than this many times are merged away
        merge_similarity (float): Cosine-similarity threshold above which two labels merge
    """
    self.kg_json_file = kg_json_file
    # Use a context manager so the JSON file handle is closed deterministically
    # (a bare ``open()`` previously leaked it).
    with kg_json_file.open() as f:
        self.kg_json = json.load(f)
    self.merge_freq = merge_freq
    self.merge_similarity = merge_similarity
    # label -> occurrence count; populated by schema_extraction()
    self.nodes_freq = {}
    # BERT model/tokenizer for the similarity merge, loaded lazily
    self.similarity_model = None
    self.tokenizer = None

human_in_the_loop_input()

Convert the schema into the dict { key: number of occurrence ... }

Then human will do the decision based on the value, to do the mapping

Source code in Docs2KG/kg/dynamic_schema.py
132
133
134
135
136
137
138
139
140
141
142
143
def human_in_the_loop_input(self) -> dict:
    """
    Expose the extracted schema as a plain dict:
    {
        key: number of occurrence
        ...
    }

    A human then decides, based on the counts, how to map the labels.
    """
    logger.info(f"Schema: {self.nodes_freq}")
    return self.nodes_freq

schema_extraction()

Extract the schema from the knowledge graph

Source code in Docs2KG/kg/dynamic_schema.py
57
58
59
60
61
62
63
64
65
66
67
def schema_extraction(self):
    """
    Extract the schema (unique node labels and their counts) from the
    knowledge graph and store it in ``self.nodes_freq``.
    """
    node_df = pd.DataFrame(self.kg_json["nodes"])
    # "labels" is a list-valued column; explode flattens it before counting
    label_counts = node_df["labels"].explode().value_counts()
    logger.info(f"Unique labels: {label_counts}")
    self.nodes_freq = label_counts.to_dict()

schema_freq_merge()

Replace the label under the threshold into text_block label

Returns:

Name Type Description
merge_mapping dict

The mapping of the merge, key is the original label, value is the new label

Source code in Docs2KG/kg/dynamic_schema.py
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
def schema_freq_merge(self) -> dict:
    """
    Map every rare, non-HTML label onto the generic "text_block" label.

    Returns:
        merge_mapping (dict): The mapping of the merge, key is the original label, value is the new label
    """
    # Skip HTML-tag labels entirely; everything else below the frequency
    # threshold collapses into "text_block".
    merge_mapping = {
        label: "text_block"
        for label, count in self.nodes_freq.items()
        if label.lower() not in HTML_TAGS and count < self.merge_freq
    }
    logger.debug(f"Merge mapping: {merge_mapping} based on frequency")
    return merge_mapping

schema_similarity_merge()

Merge the schema based on the similarity

Returns:

Name Type Description
merge_mapping dict

The mapping of the merge, key is the original label, value is the new label

Source code in Docs2KG/kg/dynamic_schema.py
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
def schema_similarity_merge(self) -> dict:
    """
    Merge the schema based on the similarity

    Labels are embedded with BERT (CLS-token vector) and every pair whose
    cosine similarity exceeds ``self.merge_similarity`` is merged, unless
    either label is an HTML tag.

    Returns:
        merge_mapping (dict): The mapping of the merge, key is the original label, value is the new label
    """
    merge_mapping = {}
    # calculate the pairwise similarity for all the labels using the sentence transformer
    # for the one with high similarity, we can merge them
    # so, we should first construct a 2D matrix
    if self.similarity_model is None:
        # lazy-load BERT on first use; reused on later calls
        self.similarity_model = BertModel.from_pretrained("bert-base-uncased")
        self.tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

    def encode_label(label):
        # NOTE(review): callers pass a single-element list ([label]); the
        # tokenizer treats it as a batch of one, so the output is the same
        # as for the bare string.
        inputs = self.tokenizer(label, return_tensors="pt")
        with torch.no_grad():
            outputs = self.similarity_model(**inputs)
        # CLS-token embedding as the label representation
        return outputs.last_hidden_state[:, 0, :].numpy().flatten()

    labels = list(self.nodes_freq.keys())
    logger.debug(labels)
    label_matrix = [encode_label([label]) for label in labels]
    label_matrix = np.array(label_matrix)

    # Compare every ordered pair (i, j), i != j.
    # NOTE(review): cosine similarity is symmetric, so a close pair (a, b)
    # produces BOTH a->b and b->a entries — a mapping cycle. Callers that
    # apply this mapping transitively should guard against that.
    for i in range(len(labels)):
        for j in range(len(labels)):
            if i == j:
                continue
            if (
                cosine_similarity(
                    label_matrix[i].reshape(1, -1), label_matrix[j].reshape(1, -1)
                )
                > self.merge_similarity
            ):
                # never merge structural HTML-tag labels
                if labels[i].lower() in HTML_TAGS or labels[j].lower() in HTML_TAGS:
                    continue
                merge_mapping[labels[i]] = labels[j]

    # then we can calculate the similarity
    logger.info(f"Merge mapping: {merge_mapping} based on similarity")
    return merge_mapping