
Email Layout KG

logger = get_logger(__name__) module-attribute

TODO:

  • Try to extract the image and file captions

EmailLayoutKG

Source code in Docs2KG/kg/email_layout_kg.py
class EmailLayoutKG:
    def __init__(self, output_dir: Path = None) -> None:
        """
        Initialize the EmailLayoutKG class

        Args:
            output_dir (Path): Path to the output directory where the converted files will be saved

        """

        self.output_dir = output_dir

        self.kg_json = {}
        self.kg_folder = self.output_dir / "kg"
        self.kg_folder.mkdir(parents=True, exist_ok=True)

        # image output directory
        self.image_output_dir = self.output_dir / "images"
        self.image_output_dir.mkdir(parents=True, exist_ok=True)
        # table output directory (extracted <table> CSVs are written here)
        self.table_output_dir = self.output_dir / "tables"
        self.table_output_dir.mkdir(parents=True, exist_ok=True)
        # attachment output directory
        self.attachment_output_dir = self.output_dir / "attachments"
        self.attachment_output_dir.mkdir(parents=True, exist_ok=True)

        self.images_df = pd.read_csv(f"{self.image_output_dir}/images.csv")
        self.attachments_df = pd.read_csv(
            f"{self.attachment_output_dir}/attachments.csv"
        )

    def create_kg(self):
        """
        Create the knowledge graph from the HTML file

        """
        with open(f"{self.output_dir}/email.html", "r") as f:
            html_content = f.read()
        soup = BeautifulSoup(html_content, "html.parser")
        """
        Loop and extract the whole soup into a tree
        Each node will have

        ```
        {
            "uuid": str,
            "node_type": str,
            "node_properties": {
                "content": str,
                // all other stuff
            },
            "children": List[Node]
        }
        ```
        """
        self.kg_json = self.extract_kg(soup)
        self.export_kg()

    def extract_kg(self, soup):
        """
        Extract the knowledge graph from the HTML file

        Args:
            soup (BeautifulSoup): Parsed HTML content

        Returns:
            dict: Knowledge graph in JSON format

        """
        node = {
            "uuid": str(uuid4()),
            "children": [],
        }

        # iterate over a snapshot: extract_kg() removes each processed child
        # from the tree, which would otherwise skip siblings mid-iteration
        for child in list(soup.children):
            if child.name is not None and soup.name != "table":
                child_node = self.extract_kg(child)
                node["children"].append(child_node)
        # node content is the element's text, stripped; "" when empty
        content = (soup.text or "").strip()
        logger.info(f"{soup.name}: {content}")
        # the root node has no real tag name (BeautifulSoup calls it
        # "[document]"), so it becomes the "email" node
        node_type = str(soup.name) if soup.name is not None else "text"
        if "document" in node_type:
            node_type = "email"
        node["node_type"] = node_type
        # sanitise attribute keys: dashes become underscores; keys
        # containing "$" or ":" are dropped entirely
        soup_attr = {}
        for key, value in soup.attrs.items():
            if "$" in key or ":" in key:
                continue
            soup_attr[key.replace("-", "_")] = value
        node["node_properties"] = {"content": content, **soup_attr}
        # if it is an image tag, then extract the image and save it to the output directory
        if soup.name == "img":
            img_url = soup.get("src", "")

            if img_url.startswith("cid:"):
                image_cid = img_url.split(":")[1]
                logger.info(image_cid)
                image_file_path = self.images_df[
                    self.images_df["cid"] == f"<{image_cid}>"
                ]["path"].values[0]
                logger.info(image_file_path)
                node["node_properties"]["img_path"] = image_file_path
            else:
                img_data = requests.get(img_url, timeout=30).content
                img_name = img_url.split("/")[-1]
                logger.info(f"image url: {img_url}")
                if "?" in img_name:
                    img_name = img_name.split("?")[0]
                with open(f"{self.output_dir}/images/{img_name}", "wb") as f:
                    f.write(img_data)
                logger.info(f"Downloaded the image from {img_url} to images")
                node["node_properties"][
                    "img_path"
                ] = f"{self.output_dir}/images/{img_name}"
        # if it is a table tag, then extract the table and save it to the output directory
        if soup.name == "table":
            rows = []
            for row in soup.find_all("tr"):
                cells = [
                    cell.get_text(strip=True) for cell in row.find_all(["th", "td"])
                ]
                rows.append(cells)
            if rows:
                df = pd.DataFrame(rows[1:], columns=rows[0])  # first row assumed to be the header
                csv_filename = f"{self.output_dir}/tables/{node['uuid']}.csv"
                df.to_csv(csv_filename, index=False)
                logger.info(f"Extracted table to {csv_filename}")
                node["node_properties"]["table_path"] = csv_filename
        # remove the node from soup after extracting the image and table
        soup.extract()

        if node_type == "email":
            # also add the metadata to the node properties
            with open(f"{self.output_dir}/metadata.json", "r") as f:
                metadata = json.load(f)
            node["node_properties"] = {**node["node_properties"], **metadata}

            # add all the attachments to children
            for _, attachment in self.attachments_df.iterrows():
                attachment_node = {
                    "uuid": str(uuid4()),
                    "node_type": "attachment",
                    "node_properties": attachment.to_dict(),
                    "children": [],
                }
                node["children"].append(attachment_node)
        return node

    def export_kg(self) -> None:
        """
        Export the knowledge graph to json file
        """
        with open(self.kg_folder / "layout_kg.json", "w") as f:
            json.dump(self.kg_json, f, indent=2)
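
A minimal usage sketch, assuming `output_dir` was already populated by the upstream email parsing step with the files the class reads (`email.html`, `metadata.json`, `images/images.csv`, `attachments/attachments.csv`); the path below is hypothetical:

```python
from pathlib import Path

from Docs2KG.kg.email_layout_kg import EmailLayoutKG

# hypothetical directory produced by the upstream email parsing step
output_dir = Path("./data/output/example_email")

kg = EmailLayoutKG(output_dir=output_dir)
kg.create_kg()  # builds the layout tree and writes <output_dir>/kg/layout_kg.json
```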

__init__(output_dir=None)

Initialize the EmailLayoutKG class

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `output_dir` | `Path` | Path to the output directory where the converted files will be saved | `None` |
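The constructor receives only `output_dir`; every other path is derived from it. A sketch of the layout it expects to find and create (the directory and file names come from the code above; the comments are explanatory):

```text
output_dir/
├── email.html            # read by create_kg()
├── metadata.json         # merged into the root "email" node's properties
├── images/
│   └── images.csv        # maps content-ids (cid) to extracted image paths
├── attachments/
│   └── attachments.csv   # one attachment node is created per row
├── tables/               # CSVs for <table> tags found in the body
└── kg/
    └── layout_kg.json    # written by export_kg()
```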

create_kg()

Create the knowledge graph from the HTML file

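Every node in the resulting tree follows the schema shown in the docstring above. A hand-written, hypothetical fragment illustrating the shape of `kg/layout_kg.json` (real UUIDs are full `uuid4` strings, and the properties depend on the email's HTML attributes, metadata, and attachment rows):

```python
# hand-written, hypothetical node tree illustrating the exported shape
example_layout_kg = {
    "uuid": "7f3c9d2e-...",
    "node_type": "email",  # the "[document]" root is renamed "email"
    "node_properties": {"content": "..."},  # plus metadata.json fields at the root
    "children": [
        {
            "uuid": "a91b4c10-...",
            "node_type": "p",
            "node_properties": {"content": "Hi team, please find the report attached."},
            "children": [],
        },
        {
            "uuid": "c0427be5-...",
            "node_type": "attachment",  # one per attachments.csv row
            "node_properties": {"...": "..."},  # that row's columns
            "children": [],
        },
    ],
}
```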

export_kg()

Export the knowledge graph to json file

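The graph is plain JSON, so downstream consumers can reload it with the standard library; a minimal sketch (hypothetical path):

```python
import json
from pathlib import Path

output_dir = Path("./data/output/example_email")  # hypothetical
with open(output_dir / "kg" / "layout_kg.json") as f:
    layout_kg = json.load(f)

print(layout_kg["node_type"])  # "email" at the root
```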

extract_kg(soup)

Extract the knowledge graph from the HTML file

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `soup` | `BeautifulSoup` | Parsed HTML content | required |

Returns:

| Type | Description |
| --- | --- |
| `dict` | Knowledge graph in JSON format |

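`extract_kg` recurses over the element's tag children and only reaches into instance state in the `img`, `table`, and root-document branches, so its behaviour on plain markup can be sketched in isolation. Note that each processed child is removed from the tree (`soup.extract()`), so a parent's `content` keeps only the text that is not inside a child tag. A hypothetical sketch, reusing an `EmailLayoutKG` instance `kg` constructed as shown earlier:

```python
from bs4 import BeautifulSoup

soup = BeautifulSoup("<div><p>Hello <b>world</b></p></div>", "html.parser")

# feed a sub-tree (not the document root) so no metadata or CSVs are consulted
node = kg.extract_kg(soup.p)

print(node["node_type"])                   # "p"
print(node["node_properties"]["content"])  # "Hello" -- <b> was extracted first
print(node["children"][0]["node_properties"]["content"])  # "world"
```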