Skip to content

Web2Tables

Web2Tables

Bases: WebParserBase

Source code in Docs2KG/parser/web/web2tables.py
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
class Web2Tables(WebParserBase):
    """Extract HTML ``<table>`` elements from a scraped page into CSV files.

    Expects the base parser to have downloaded the page to
    ``<output_dir>/index.html``; each table becomes ``<output_dir>/tables/<i>.csv``.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Dedicated subdirectory for the per-table CSV output.
        self.table_output_dir = self.output_dir / "tables"
        self.table_output_dir.mkdir(parents=True, exist_ok=True)

    def extract2tables(self):
        """
        Extract the HTML file to tables and save it to the output directory

        Each ``<table>`` in ``index.html`` is written as ``tables/<i>.csv``,
        using the table's first row as the CSV header.
        """
        # Explicit encoding: scraped pages are typically UTF-8, and the
        # platform default (e.g. cp1252 on Windows) could mis-decode them.
        with open(f"{self.output_dir}/index.html", "r", encoding="utf-8") as f:
            html_content = f.read()

        soup = BeautifulSoup(html_content, "html.parser")
        for i, table in enumerate(soup.find_all("table")):
            rows = []
            for row in table.find_all("tr"):
                cells = [
                    cell.get_text(strip=True) for cell in row.find_all(["th", "td"])
                ]
                rows.append(cells)
            if not rows:
                # Guard: an empty <table> would raise IndexError on rows[0].
                continue
            df = pd.DataFrame(rows[1:], columns=rows[0])  # first row is the header
            # Reuse the directory created in __init__ instead of re-deriving
            # the path by string formatting.
            csv_filename = self.table_output_dir / f"{i}.csv"
            df.to_csv(csv_filename, index=False)
        # Log once for the whole file, not once per table.
        logger.info(f"Extracted the HTML file from {self.url} to tables")

extract2tables()

Extract the HTML file to tables and save it to the output directory

Source code in Docs2KG/parser/web/web2tables.py
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
def extract2tables(self):
    """
    Extract the HTML file to tables and save it to the output directory

    Each ``<table>`` found in ``<output_dir>/index.html`` is written as
    ``<output_dir>/tables/<i>.csv``, using the table's first row as the
    CSV header.
    """
    # Explicit encoding: scraped pages are typically UTF-8, and the
    # platform default (e.g. cp1252 on Windows) could mis-decode them.
    with open(f"{self.output_dir}/index.html", "r", encoding="utf-8") as f:
        html_content = f.read()

    soup = BeautifulSoup(html_content, "html.parser")
    for i, table in enumerate(soup.find_all("table")):
        rows = []
        for row in table.find_all("tr"):
            cells = [
                cell.get_text(strip=True) for cell in row.find_all(["th", "td"])
            ]
            rows.append(cells)
        if not rows:
            # Guard: an empty <table> would raise IndexError on rows[0].
            continue
        df = pd.DataFrame(rows[1:], columns=rows[0])  # Assuming first row is header
        csv_filename = f"{self.output_dir}/tables/{i}.csv"
        df.to_csv(csv_filename, index=False)
    # Log once for the whole file, not once per table.
    logger.info(f"Extracted the HTML file from {self.url} to tables")