Web2urls

Web2URLs

Bases: WebParserBase

Source code in Docs2KG/parser/web/web2urls.py
class Web2URLs(WebParserBase):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.table_output_dir = self.output_dir / "urls"
        self.table_output_dir.mkdir(parents=True, exist_ok=True)

    def extract2tables(self):
        """
        Extract the HTML file to tables and save it to the output directory

        """

        with open(f"{self.output_dir}/index.html", "r") as f:
            html_content = f.read()

        soup = BeautifulSoup(html_content, "html.parser")
        # find all urls and save them to a csv file
        urls = []
        for a in soup.find_all("a"):
            urls.append(a.get("href"))
        df = pd.DataFrame(urls, columns=["URL"])
        csv_filename = f"{self.output_dir}/urls/urls.csv"
        df.to_csv(csv_filename, index=False)

extract2tables()

Extract all URLs (anchor href attributes) from the downloaded index.html and save them as a single-column CSV table in the urls/ folder of the output directory

Source code in Docs2KG/parser/web/web2urls.py
def extract2tables(self):
    """
    Extract the HTML file to tables and save it to the output directory

    """

    with open(f"{self.output_dir}/index.html", "r") as f:
        html_content = f.read()

    soup = BeautifulSoup(html_content, "html.parser")
    # find all urls and save them to a csv file
    urls = []
    for a in soup.find_all("a"):
        urls.append(a.get("href"))
    df = pd.DataFrame(urls, columns=["URL"])
    csv_filename = f"{self.output_dir}/urls/urls.csv"
    df.to_csv(csv_filename, index=False)
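
A minimal usage sketch, assuming the WebParserBase constructor accepts the page URL and an output directory and has already written the fetched page to <output_dir>/index.html; the exact constructor arguments are assumptions, not documented on this page:

from Docs2KG.parser.web.web2urls import Web2URLs

# Hypothetical arguments: adjust to the actual WebParserBase signature.
parser = Web2URLs(url="https://example.com", output_dir="output/example")
parser.extract2tables()
# Every href found in output/example/index.html is now written to
# output/example/urls/urls.csv under a single "URL" column.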