NER prompt based

NERLLMPromptExtractor

Bases: SemanticKGConstructionBase

Extract named entities using an LLM and a configured entity type list.
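
A minimal usage sketch, assuming the import path follows the source location below, that an Ollama server with the phi3.5 model is reachable, and that the project id is a placeholder:

from Docs2KG.kg_construction.semantic_kg.ner.ner_prompt_based import (
    NERLLMPromptExtractor,
)

# Construct the extractor; this loads the project's entity type list
extractor = NERLLMPromptExtractor(project_id="demo-project")

# Extract entities from free text
entities = extractor.extract_entities(
    "Perth is the capital of Western Australia."
)
for entity in entities:
    print(entity["text"], entity["label"], entity["confidence"])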

Source code in Docs2KG/kg_construction/semantic_kg/ner/ner_prompt_based.py
class NERLLMPromptExtractor(SemanticKGConstructionBase):
    """
    Extract named entities using an LLM and a configured entity type list.
    """

    def __init__(
        self,
        project_id: str,
        agent_name="phi3.5",
        agent_type="ollama",
        **kwargs,
    ):
        """
        Initialize LLM NER Extractor

        Args:
            llm_entity_type_agent: Whether to use LLM for entity type judgement
        """
        super().__init__(
            project_id=project_id,
        )

        self.llm_ner_extract_agent = AgentManager(agent_name, agent_type, **kwargs)
        self.entity_type_list = []
        self.load_entity_type()

    def extract_entities(self, text: str) -> List[Dict[str, Any]]:
        """
        Extract entities from the given text, handling long texts by splitting into chunks

        Args:
            text: Text to extract entities from

        Returns:
            list: List of dictionaries containing entity information with format:
            {
                "id": str,          # Unique identifier for the entity
                "end": int,         # End position in text
                "start": int,       # Start position in text
                "text": str,        # Matched text
                "label": str,       # Entity type/label
                "confidence": float # Confidence score
            }
        """
        if not text or (len(self.entity_type_list) == 0):
            return []

        # Split text into chunks, preserving the periods
        text_chunks = [
            chunk.strip() + "." for chunk in text.split(".") if chunk.strip()
        ]

        # Process each chunk while tracking overall character position
        all_entities = []
        current_position = 0

        for chunk in text_chunks:
            # Create prompt for current chunk
            chunk_prompt = f"""
            Extract entities from the given text:
            {chunk.lower()}

            It should be one of the following entity types:
            {", ".join(self.entity_type_list)}

            Please output a list of entities in the following format via JSON:
            [
                    {{
                        "text": "entity text",
                        "label": "entity type",
                        "confidence": 1.0
                    }},
                    ...
                ]

            entity text is the matched text
            entity type is the label of the entity
            confidence is the confidence score of the entity, it should be within [0.0, 1.0]
            You should return it as an array of JSON objects.
            """

            try:
                # Process chunk
                res = self.llm_ner_extract_agent.process_input(
                    chunk_prompt, reset_session=True
                )
                res_json_str = res["response"].strip()
                # logger.info(f"LLM response for chunk: {res_json_str}")

                entities_json = json.loads(res_json_str)
                # if the json is a dict, convert it to a list
                if isinstance(entities_json, dict):
                    entities_json = [entities_json]

                # Verify entities for this chunk
                verified_chunk_entities = self.verify_output_entities(
                    chunk.lower(), entities_json
                )

                logger.info(
                    f"Verified entities for chunk: {len(verified_chunk_entities)}. \n{verified_chunk_entities}"
                )
                # Adjust start and end positions based on current position in overall text
                for entity in verified_chunk_entities:
                    entity["start"] += current_position
                    entity["end"] += current_position
                    entity["method"] = self.__class__.__name__

                all_entities.extend(verified_chunk_entities)

            except Exception as e:
                logger.error(f"Failed to extract entities from chunk: {str(e)}")
                logger.exception(e)
                continue

            # Update current position for next chunk
            current_position += len(chunk)

        logger.critical(
            f"All extracted and verified entities: {len(all_entities)}. \n{all_entities}"
        )
        return all_entities

    def verify_output_entities(
        self, text, entities: List[Dict[str, Any]]
    ) -> List[Dict[str, Any]]:
        """
        Verify the extracted entities, the start and end indices is correct

        Args:
            text: Text to extract entities from
            entities: List of extracted entities

        Returns:
            list: List of verified entities
        """
        verified_entities = []
        for entity in entities:
            if entity["label"] not in self.entity_type_list:
                logger.info(
                    f"Dropping entity: {entity} for entity type: {entity['label']}"
                )
                logger.warning(f"Entity type {entity['label']} not in entity type list")
                continue
            entity["start"], entity["end"] = self.locate_text_start_end(text, entity)
            if entity["start"] is None or entity["end"] is None:
                logger.error(f"Failed to locate entity: {entity}")
                continue
            entity["start"], entity["end"] = int(entity["start"]), int(entity["end"])
            # add a unique id for the entity
            entity["id"] = (
                f"ner-llm-{hash(entity['text'] + str(entity['start']) + str(entity['end']) + entity['label'])}"
            )
            verified_entities.append(entity)
        return verified_entities

    @staticmethod
    def verify_entity_position(start, end, text, entity):
        """
        Verify the entity position in the text

        Args:
            start: Start index of the entity
            end: End index of the entity
            text: Text the entity was extracted from
            entity: Extracted entity

        Returns:
            bool: True if the entity position is correct, False otherwise
        """
        try:
            return text[start:end] == entity["text"]
        except Exception as e:
            logger.error(f"Failed to verify entity position: {str(e)}")
            return False

    @staticmethod
    def locate_text_start_end(text, entity):
        """
        Locate the start and end index of the entity in the text

        Args:
            text: Text to extract entities from
            entity: Extracted entity

        Returns:
            tuple: Start and end index of the entity
        """
        try:
            start = text.find(entity["text"])
            # find() returns -1 when the entity text is not present
            if start == -1:
                return None, None
            end = start + len(entity["text"])
            return start, end
        except Exception as e:
            logger.error(f"Failed to locate entity in text: {str(e)}")
            return None, None

    def construct_kg(self, input_data: List[Path]) -> None:
        """
        Construct a semantic knowledge graph from input data.

        Args:
            input_data: Input data to construct the knowledge graph
        """
        logger.info(
            f"Extracting entities from {len(input_data)} layout knowledge graphs"
        )
        for layout_kg_path in input_data:
            if not layout_kg_path.exists():
                logger.error(f"Layout knowledge graph not found at {layout_kg_path}")
                continue
            layout_kg = self.load_layout_kg(layout_kg_path)

            if "data" not in layout_kg:
                logger.error(f"Document data not found in {layout_kg_path}")
                continue

            for item in layout_kg["data"]:
                if "text" not in item:
                    logger.error(f"Text not found in document item: {item}")
                    continue
                text = item["text"]
                entities = self.extract_entities(text)
                # extend the extracted entities to the layout knowledge graph
                item["entities"].extend(entities)
                # remove duplicated entities based on start and end positions, text and label
                item["entities"] = self.unique_entities(item["entities"])

            self.update_layout_kg(layout_kg_path, layout_kg)

__init__(project_id, agent_name='phi3.5', agent_type='ollama', **kwargs)

Initialize the LLM NER extractor.

Parameters:

Name Type Description Default
project_id str

Identifier of the Docs2KG project

required
agent_name

Name of the LLM agent to use

'phi3.5'
agent_type

Type of the LLM agent backend

'ollama'
**kwargs

Additional keyword arguments forwarded to AgentManager

{}
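
A construction sketch; agent_name, agent_type, and any extra keyword arguments are forwarded to AgentManager, so the override values below are placeholders for whatever backends AgentManager supports:

# Defaults: the phi3.5 model served via Ollama
extractor = NERLLMPromptExtractor(project_id="demo-project")

# Placeholder override: any model name AgentManager accepts for the
# "ollama" agent type can be supplied instead
extractor = NERLLMPromptExtractor(
    project_id="demo-project",
    agent_name="llama3.1",
    agent_type="ollama",
)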

construct_kg(input_data)

Construct a semantic knowledge graph from input data.

Parameters:

Name Type Description Default
input_data List[Path]

Input data to construct the knowledge graph

required
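
A usage sketch with placeholder paths; each file is expected to be a layout knowledge graph whose "data" items carry "text" and "entities" fields, as the source above checks:

from pathlib import Path

extractor = NERLLMPromptExtractor(project_id="demo-project")  # placeholder project

layout_kg_paths = [
    Path("projects/demo-project/layout_kg/doc_1.json"),  # placeholder paths
    Path("projects/demo-project/layout_kg/doc_2.json"),
]

# Missing files and items without a "text" field are logged and skipped;
# extracted entities are merged, de-duplicated, and written back in place.
extractor.construct_kg(layout_kg_paths)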

extract_entities(text)

Extract entities from the given text, handling long texts by splitting into chunks

Parameters:

Name Type Description Default
text str

Text to extract entities from

required

Returns:

Name Type Description
list List[Dict[str, Any]]

List of dictionaries containing entity information with format:

{
    "id": str,          # Unique identifier for the entity
    "end": int,         # End position in text
    "start": int,       # Start position in text
    "text": str,        # Matched text
    "label": str,       # Entity type/label
    "confidence": float # Confidence score
}

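A call sketch; note that the input is split on periods and lowercased before prompting, so the returned "text" fields are lowercase and "start"/"end" are character offsets into the chunked text. The label below assumes a loaded entity type list that contains LOCATION:

extractor = NERLLMPromptExtractor(project_id="demo-project")  # placeholder project

text = "Perth is in Western Australia. Gold mines operate nearby."
entities = extractor.extract_entities(text)
# A returned entity looks roughly like:
# {
#     "id": "ner-llm-...",
#     "text": "perth",
#     "label": "LOCATION",
#     "confidence": 0.9,
#     "start": 0,
#     "end": 5,
#     "method": "NERLLMPromptExtractor",
# }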

locate_text_start_end(text, entity) staticmethod

Locate the start and end index of the entity in the text

Parameters:

Name Type Description Default
text

Text to search for the entity in

required
entity

Extracted entity

required

Returns:

Name Type Description
tuple

Start and end index of the entity, or (None, None) if not found

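A quick sketch of the lookup semantics: the first occurrence of the entity text wins, and a miss yields (None, None):

text = "gold was found near kalgoorlie."
start, end = NERLLMPromptExtractor.locate_text_start_end(
    text, {"text": "kalgoorlie"}
)
assert (start, end) == (20, 30) and text[start:end] == "kalgoorlie"

assert NERLLMPromptExtractor.locate_text_start_end(
    text, {"text": "perth"}
) == (None, None)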

verify_entity_position(start, end, text, entity) staticmethod

Verify the entity position in the text

Parameters:

Name Type Description Default
start

Start index of the entity

required
end

End index of the entity

required
text

Text the entity was extracted from

required
entity

Extracted entity

required

Returns:

Name Type Description
bool

True if the entity position is correct, False otherwise

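A minimal check sketch; the method simply compares the slice text[start:end] against the entity text:

text = "gold was found near kalgoorlie."
entity = {"text": "kalgoorlie"}

assert NERLLMPromptExtractor.verify_entity_position(20, 30, text, entity)
assert not NERLLMPromptExtractor.verify_entity_position(0, 10, text, entity)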

verify_output_entities(text, entities)

Verify the extracted entities and ensure their start and end indices are correct.

Parameters:

Name Type Description Default
text

Text the entities were extracted from

required
entities List[Dict[str, Any]]

List of extracted entities

required

Returns:

Name Type Description
list List[Dict[str, Any]]

List of verified entities

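A filtering sketch with placeholder labels, assuming the loaded entity type list contains LOCATION but not COMMODITY; entities with unknown labels or text that cannot be located are dropped, and survivors gain start, end, and id fields:

extractor = NERLLMPromptExtractor(project_id="demo-project")  # placeholder project

text = "gold was found near kalgoorlie."
raw_entities = [
    {"text": "kalgoorlie", "label": "LOCATION", "confidence": 0.9},
    {"text": "perth", "label": "LOCATION", "confidence": 0.8},    # not in text
    {"text": "gold", "label": "COMMODITY", "confidence": 0.7},    # unknown label
]

verified = extractor.verify_output_entities(text, raw_entities)
# Only the "kalgoorlie" entity survives, now carrying
# "start": 20, "end": 30 and a hash-based "id"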