Skip to content

Metadata kg

MetadataKGConstruction

Bases: KGConstructionBase

The input should be a CSV file with the following columns: - name: the name of the document - other columns: metadata fields; we extract all unique values in each column and create a node for each value - we then create a relationship between the document and each metadata value - for columns that are continuous, we do not create nodes and instead store the values as properties of the document node

Source code in Docs2KG/kg_construction/metadata_kg/metadata_kg.py
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
class MetadataKGConstruction(KGConstructionBase):
    """
    Construct a metadata knowledge graph from a tabular document listing.

    The input should be a CSV file (or DataFrame) with the following columns:

    - name: the name of the document
    - other columns: metadata fields. For each categorical column, every
      unique value becomes a node, and a relationship links each document
      to its metadata-value nodes.
    - Columns detected as continuous are not turned into nodes; their values
      are stored as properties on the document node instead.
    """

    def __init__(self, project_id: str):
        """
        Args:
            project_id: identifier of the project, forwarded to the base class.
        """
        super().__init__(project_id)
        # Column holding the document identifier; may be overridden in construct().
        self.document_id_column: str = "name"
        # Populated by _identify_column_types() on each construct() call.
        self.continuous_columns: List[str] = []
        self.categorical_columns: List[str] = []

    @staticmethod
    def _is_continuous(series: pd.Series, threshold: float = 0.5) -> bool:
        """
        Determine if a column should be treated as continuous based on the
        ratio of unique values to total values.

        Args:
            series: pandas Series to check
            threshold: unique-value ratio above which a numeric column is
                considered continuous

        Returns:
            bool: True if the column should be treated as continuous
        """
        # Check dtype first and guard the empty case: the original computed
        # the ratio unconditionally, raising ZeroDivisionError on an empty
        # column. An empty or non-numeric column is never continuous.
        if len(series) == 0 or not pd.api.types.is_numeric_dtype(series):
            return False
        unique_ratio = len(series.unique()) / len(series)
        return unique_ratio > threshold

    def _identify_column_types(self, df: pd.DataFrame) -> None:
        """
        Identify continuous and categorical columns in the dataframe and
        store them on the instance.

        Args:
            df: input dataframe
        """
        # Reset before classifying: the original appended without clearing,
        # so repeated construct() calls accumulated duplicate column names
        # and produced duplicate nodes/relationships.
        self.continuous_columns = []
        self.categorical_columns = []
        for column in df.columns:
            if column == self.document_id_column:
                continue  # the ID column is never a metadata field
            if self._is_continuous(df[column]):
                self.continuous_columns.append(column)
            else:
                self.categorical_columns.append(column)

    def _create_document_nodes(self, df: pd.DataFrame) -> List[Dict[str, Any]]:
        """
        Create document nodes, attaching continuous columns as properties.

        Args:
            df: input dataframe

        Returns:
            List of document nodes with properties
        """
        document_nodes = []
        for _, row in df.iterrows():
            # Continuous metadata becomes node properties; NaN values are skipped.
            properties = {
                col: row[col] for col in self.continuous_columns if pd.notna(row[col])
            }
            node = {
                "id": f"doc_{row[self.document_id_column]}",
                "type": "Document",
                "properties": {
                    self.document_id_column: row[self.document_id_column],
                    **properties,
                },
            }
            document_nodes.append(node)
        return document_nodes

    def _create_metadata_nodes(self, df: pd.DataFrame) -> List[Dict[str, Any]]:
        """
        Create one node per unique categorical metadata value.

        Args:
            df: input dataframe

        Returns:
            List of metadata value nodes
        """
        metadata_nodes = []
        for column in self.categorical_columns:
            # NaN is not a real metadata value, so drop it before deduplicating.
            unique_values = df[column].dropna().unique()
            for value in unique_values:
                node = {
                    # ID scheme "<column>_<value>" must match _create_relationships.
                    "id": f"{column}_{value}",
                    "type": column,
                    "properties": {"value": value},
                }
                metadata_nodes.append(node)
        return metadata_nodes

    def _create_relationships(self, df: pd.DataFrame) -> List[Dict[str, Any]]:
        """
        Create relationships between documents and their metadata values.

        Args:
            df: input dataframe

        Returns:
            List of relationships (source, target, type)
        """
        relationships = []
        for _, row in df.iterrows():
            doc_id = f"doc_{row[self.document_id_column]}"
            for column in self.categorical_columns:
                if pd.notna(row[column]):
                    relationship = {
                        "source": doc_id,
                        # Must mirror the node ID scheme in _create_metadata_nodes.
                        "target": f"{column}_{row[column]}",
                        "type": f"HAS_{column.upper()}",
                    }
                    relationships.append(relationship)
        return relationships

    def construct(
        self, docs: Union[str, Path, pd.DataFrame], document_id_column: str = "name"
    ) -> Dict[str, List[Dict[str, Any]]]:
        """
        Construct knowledge graph from document metadata.

        Args:
            docs: Path to a CSV file (str or Path) or a pandas DataFrame
                containing document metadata
            document_id_column: Name of the column containing document IDs

        Returns:
            Dictionary containing nodes and relationships for the knowledge graph

        Raises:
            ValueError: if document_id_column is missing from the input data
        """
        # Load data if a path was provided; copy a DataFrame so we never
        # mutate the caller's object.
        if isinstance(docs, (str, Path)):
            df = pd.read_csv(docs)
        else:
            df = docs.copy()

        # Remove unnamed columns (pandas names stray index columns "Unnamed: N").
        df = df.loc[:, ~df.columns.str.contains("^Unnamed")]
        # Validate required columns
        if document_id_column not in df.columns:
            raise ValueError(f"Input data must contain '{document_id_column}' column")
        self.document_id_column = document_id_column
        # Identify column types (resets any state from previous calls)
        self._identify_column_types(df)

        # Create nodes and relationships
        document_nodes = self._create_document_nodes(df)
        metadata_nodes = self._create_metadata_nodes(df)
        relationships = self._create_relationships(df)

        metadata_kg = {
            "nodes": document_nodes + metadata_nodes,
            "relationships": relationships,
        }

        # Persist alongside other project artifacts (base-class helper).
        self.export_json(metadata_kg, "metadata_kg.json")

        return metadata_kg

construct(docs, document_id_column='name')

Construct knowledge graph from document metadata

Parameters:

Name Type Description Default
docs Union[str, DataFrame]

Either path to CSV file or pandas DataFrame containing document metadata

required
document_id_column str

Name of the column containing document IDs

'name'

Returns:

Type Description
Dict[str, List[Dict[str, Any]]]

Dictionary containing nodes and relationships for the knowledge graph

Source code in Docs2KG/kg_construction/metadata_kg/metadata_kg.py
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
def construct(
    self, docs: Union[str, Path, pd.DataFrame], document_id_column: str = "name"
) -> Dict[str, List[Dict[str, Any]]]:
    """
    Construct knowledge graph from document metadata.

    Args:
        docs: Path to a CSV file (str or Path) or a pandas DataFrame
            containing document metadata
        document_id_column: Name of the column containing document IDs

    Returns:
        Dictionary containing nodes and relationships for the knowledge graph

    Raises:
        ValueError: if document_id_column is missing from the input data
    """
    # Load data if a path was provided; copy a DataFrame so the caller's
    # object is never mutated.
    if isinstance(docs, (str, Path)):
        df = pd.read_csv(docs)
    else:
        df = docs.copy()

    # Remove unnamed columns (pandas names stray index columns "Unnamed: N").
    df = df.loc[:, ~df.columns.str.contains("^Unnamed")]
    # Validate required columns
    if document_id_column not in df.columns:
        raise ValueError(f"Input data must contain '{document_id_column}' column")
    self.document_id_column = document_id_column
    # Reset classification state so repeated construct() calls do not
    # accumulate duplicate column names (and hence duplicate nodes/edges).
    self.continuous_columns = []
    self.categorical_columns = []
    # Identify column types
    self._identify_column_types(df)

    # Create nodes and relationships
    document_nodes = self._create_document_nodes(df)
    metadata_nodes = self._create_metadata_nodes(df)
    relationships = self._create_relationships(df)

    metadata_kg = {
        "nodes": document_nodes + metadata_nodes,
        "relationships": relationships,
    }

    # Persist alongside other project artifacts (base-class helper).
    self.export_json(metadata_kg, "metadata_kg.json")

    return metadata_kg