NER spaCy Matcher

NERSpacyMatcher

Bases: SemanticKGConstructionBase

Requires the spaCy English model; run python -m spacy download en_core_web_sm first.
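
A minimal setup sketch (the import path is inferred from the source path shown below; the project id is a placeholder). Note that the constructor also creates an NERLLMJudge (phi3.5 via ollama by default), so an ollama server should be reachable:

# One-off model download (shell):
#   python -m spacy download en_core_web_sm
from Docs2KG.kg_construction.semantic_kg.ner.ner_spacy_match import NERSpacyMatcher

matcher = NERSpacyMatcher(project_id="demo-project")  # placeholder project id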

Source code in Docs2KG/kg_construction/semantic_kg/ner/ner_spacy_match.py
class NERSpacyMatcher(SemanticKGConstructionBase):
    """
    To get this working, need to run: python -m spacy download en_core_web_sm first

    """

    def __init__(
        self,
        project_id: str,
        agent_name: str = "phi3.5",
        agent_type: str = "ollama",
    ):
        super().__init__(project_id)
        # Load SpaCy model (use a smaller model for speed)
        self.nlp = spacy.load("en_core_web_sm")
        self.matcher = Matcher(self.nlp.vocab)
        self.entity_dict = {}
        self.load_entity_list()
        self.llm_judgement_agent = NERLLMJudge(agent_name, agent_type)

    def load_entity_list(self):
        try:
            entity_list_path = Path(PROJECT_CONFIG.semantic_kg.entity_list)
            if not entity_list_path.exists():
                raise FileNotFoundError(f"Entity list not found at {entity_list_path}")
            with timer(logger, "Loading entity list"):
                df = pd.read_csv(entity_list_path, sep=r",(?=[^,]*$)", engine="python")
            self.entity_dict = dict(zip(df["entity"], df["entity_type"]))
            self._initialize_patterns()

        except Exception as e:
            logger.error(f"Error loading entity list: {e}")
            return

    def _initialize_patterns(self):
        """
        Convert entity dictionary to SpaCy patterns
        Handles both single-word and multi-word entities
        """
        patterns = []

        for entity_text, entity_type in self.entity_dict.items():
            # Convert entity text to lowercase
            entity_lower = entity_text.lower()

            # Split entity text into tokens
            tokens = entity_lower.split()

            # Create pattern for exact matching
            pattern = [{"LOWER": token} for token in tokens]

            # Add pattern to matcher with unique ID
            pattern_id = f"{entity_type}_{hash(entity_lower)}"
            self.matcher.add(pattern_id, [pattern])

            # Store mapping of pattern_id to original entity text and type
            patterns.append(
                {
                    "id": pattern_id,
                    "text": entity_text,
                    "type": entity_type,
                    "pattern": pattern,
                }
            )

        self.patterns = patterns

    def extract_entities(self, text: str) -> List[Dict[str, Any]]:
        """
        Given the text, find the case-insensitive match of the entities in the entity dict.
        Uses SpaCy Matcher to find the entities in the text first.

        Args:
            text (str): The input text to search for entities

        Returns:
            list: List of dictionaries containing entity information with format:
            {
                "id": str,          # Unique identifier for the entity
                "end": int,         # End position in text
                "start": int,       # Start position in text
                "text": str,        # Matched text
                "label": str,       # Entity type/label
                "confidence": float # Confidence score
            }
        """
        if not text or not self.entity_dict:
            return []
        text = text.lower()
        # Process text with SpaCy
        doc = self.nlp(text)

        # Find matches using the matcher
        matches = self.matcher(doc)

        # Convert matches to our format
        results = []
        for match_id, start, end in matches:
            # Get the matched span
            span = doc[start:end]

            # Get the original text from the span
            matched_text = span.text

            # Find corresponding pattern info
            pattern_id = self.nlp.vocab.strings[match_id]
            entity_info = next(
                (p for p in self.patterns if p["id"] == pattern_id), None
            )

            if entity_info:
                # Create match entry
                if not self._validate_match(doc, start, end):
                    continue

                is_correct = self.llm_judgement_agent.judge(
                    ner=matched_text, ner_type=entity_info["type"], text=text
                )
                if not is_correct:
                    continue

                match = {
                    "id": f"ner-spacy-{hash(matched_text + str(start) + str(end))}-{str(uuid4())}",
                    "start": span.start_char,
                    "end": span.end_char,
                    "text": matched_text,
                    "label": entity_info["type"],
                    "confidence": (
                        0.95
                        if matched_text.lower() == entity_info["text"].lower()
                        else 0.9
                    ),
                    "method": self.__class__.__name__,
                }
                results.append(match)

        # Sort results by start position
        results.sort(key=lambda x: x["start"])

        logger.info(f"Extracted entities: {results} for text: {text}")
        return results

    @staticmethod
    def _validate_match(doc, start, end):
        """
        Validate if a match is at proper word boundaries

        Args:
            doc: SpaCy Doc object
            start: Start token index
            end: End token index

        Returns:
            bool: Whether the match is valid
        """

        # Check if the span is at token boundaries
        if start > 0 and doc[start - 1].is_alpha:
            return False
        if end < len(doc) and doc[end].is_alpha:
            return False

        return True

    def construct_kg(self, input_data: List[Path]) -> None:
        """
        Construct a semantic knowledge graph from input data.

        Args:
            input_data: Input data to construct the knowledge graph
        """
        # Process each document
        for doc in tqdm(input_data, desc="Processing documents"):
            # Extract entities from the document text
            if not doc.exists():
                logger.error(f"Document not found at {doc}")
                continue
            logger.info(f"Processing document: {doc}")
            layout_kg = self.load_layout_kg(doc)
            if "data" not in layout_kg:
                logger.error(f"Document data not found in {doc}")
                continue
            for item in layout_kg["data"]:
                if "text" not in item:
                    logger.error(f"Text not found in document item: {item}")
                    continue
                text = item["text"]
                entities = self.extract_entities(text)
                # expand the item entities list with the extracted entities
                item["entities"].extend(entities)
                # then remove duplicated entities based on start and end positions, text and label
                item["entities"] = self.unique_entities(item["entities"])

            self.update_layout_kg(doc, layout_kg)
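
load_entity_list expects a two-column CSV (entity, entity_type) at the path configured in PROJECT_CONFIG.semantic_kg.entity_list. Because the sep regex splits each row on its last comma only, the entity text itself may contain commas. A hypothetical entity list (entity names and types are illustrative):

entity,entity_type
iron ore,COMMODITY
Perth, Western Australia,LOCATION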

construct_kg(input_data)

Construct a semantic knowledge graph from input data.

Parameters:

Name        Type        Description                                    Default
input_data  List[Path]  Input data to construct the knowledge graph   required
Source code in Docs2KG/kg_construction/semantic_kg/ner/ner_spacy_match.py
def construct_kg(self, input_data: List[Path]) -> None:
    """
    Construct a semantic knowledge graph from input data.

    Args:
        input_data: Input data to construct the knowledge graph
    """
    # Process each document
    for doc in tqdm(input_data, desc="Processing documents"):
        # Extract entities from the document text
        if not doc.exists():
            logger.error(f"Document not found at {doc}")
            continue
        logger.info(f"Processing document: {doc}")
        layout_kg = self.load_layout_kg(doc)
        if "data" not in layout_kg:
            logger.error(f"Document data not found in {doc}")
            continue
        for item in layout_kg["data"]:
            if "text" not in item:
                logger.error(f"Text not found in document item: {item}")
                continue
            text = item["text"]
            entities = self.extract_entities(text)
            # expand the item entities list with the extracted entities
            item["entities"].extend(entities)
            # then remove duplicated entities based on start and end positions, text and label
            item["entities"] = self.unique_entities(item["entities"])

        self.update_layout_kg(doc, layout_kg)
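
A hedged usage sketch, assuming the matcher from the setup sketch above and layout-KG JSON files produced by an earlier Docs2KG stage (the paths are placeholders). Note that each item in layout_kg["data"] is expected to already carry an "entities" list:

from pathlib import Path

docs = [
    Path("projects/demo/layout/report_1.json"),  # placeholder path
    Path("projects/demo/layout/report_2.json"),  # placeholder path
]
# Extends each item's "entities" in place, de-duplicates them, and persists via update_layout_kg.
matcher.construct_kg(docs)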

extract_entities(text)

Given the text, find the case-insensitive match of the entities in the entity dict. Uses SpaCy Matcher to find the entities in the text first.

Parameters:

Name  Type  Description                             Default
text  str   The input text to search for entities   required

Returns:

Name  Type                   Description
list  List[Dict[str, Any]]   List of dictionaries containing entity information with format:
                             {
                                 "id": str,          # Unique identifier for the entity
                                 "end": int,         # End position in text
                                 "start": int,       # Start position in text
                                 "text": str,        # Matched text
                                 "label": str,       # Entity type/label
                                 "confidence": float # Confidence score
                             }

Source code in Docs2KG/kg_construction/semantic_kg/ner/ner_spacy_match.py
def extract_entities(self, text: str) -> List[Dict[str, Any]]:
    """
    Given the text, find the case-insensitive match of the entities in the entity dict.
    Uses SpaCy Matcher to find the entities in the text first.

    Args:
        text (str): The input text to search for entities

    Returns:
        list: List of dictionaries containing entity information with format:
        {
            "id": str,          # Unique identifier for the entity
            "end": int,         # End position in text
            "start": int,       # Start position in text
            "text": str,        # Matched text
            "label": str,       # Entity type/label
            "confidence": float # Confidence score
        }
    """
    if not text or not self.entity_dict:
        return []
    text = text.lower()
    # Process text with SpaCy
    doc = self.nlp(text)

    # Find matches using the matcher
    matches = self.matcher(doc)

    # Convert matches to our format
    results = []
    for match_id, start, end in matches:
        # Get the matched span
        span = doc[start:end]

        # Get the original text from the span
        matched_text = span.text

        # Find corresponding pattern info
        pattern_id = self.nlp.vocab.strings[match_id]
        entity_info = next(
            (p for p in self.patterns if p["id"] == pattern_id), None
        )

        if entity_info:
            # Create match entry
            if not self._validate_match(doc, start, end):
                continue

            is_correct = self.llm_judgement_agent.judge(
                ner=matched_text, ner_type=entity_info["type"], text=text
            )
            if not is_correct:
                continue

            match = {
                "id": f"ner-spacy-{hash(matched_text + str(start) + str(end))}-{str(uuid4())}",
                "start": span.start_char,
                "end": span.end_char,
                "text": matched_text,
                "label": entity_info["type"],
                "confidence": (
                    0.95
                    if matched_text.lower() == entity_info["text"].lower()
                    else 0.9
                ),
                "method": self.__class__.__name__,
            }
            results.append(match)

    # Sort results by start position
    results.sort(key=lambda x: x["start"])

    logger.info(f"Extracted entities: {results} for text: {text}")
    return results
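
A small worked example of calling extract_entities directly (the entity list entry and printed values are illustrative). The input is lowercased before matching, and each candidate match must pass both the word-boundary check and the LLM judge:

# Assumes the entity list contains, e.g., ("iron ore", "COMMODITY").
hits = matcher.extract_entities("The mine shipped iron ore in 2021.")
for hit in hits:
    print(hit["text"], hit["label"], hit["start"], hit["end"], hit["confidence"])
# Each hit also carries an "id" and "method" == "NERSpacyMatcher".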