Skip to content

Web2images

Web2Images

Bases: WebParserBase

Source code in Docs2KG/parser/web/web2images.py
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
class Web2Images(WebParserBase):
    """Download the images referenced by a saved web page.

    On construction an ``images`` sub-directory is created under
    ``self.output_dir``. ``self.output_dir`` and ``self.url`` are not set
    here; presumably they come from ``WebParserBase`` (confirm there).
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.image_output_dir = self.output_dir / "images"
        self.image_output_dir.mkdir(parents=True, exist_ok=True)

    def extract2images(self):
        """
        Extract the images referenced by the saved HTML file and save them
        to the image output directory.

        Reads ``index.html`` from ``self.output_dir``, resolves the ``src``
        of every ``<img>`` tag against the page URL, downloads it and writes
        the bytes to ``self.image_output_dir``. Each file is named after the
        fully percent-encoded original ``src`` value.

        ``<img>`` tags without a usable ``src`` are skipped. An image that
        cannot be downloaded is logged and skipped, so the remaining images
        are still processed.
        """
        url = unquote(self.url)
        with open(f"{self.output_dir}/index.html", "r") as f:
            html_content = f.read()
        soup = BeautifulSoup(html_content, "html.parser")
        for imgtag in soup.find_all("img"):
            src = imgtag.get("src")
            # A missing src would crash on .startswith; an empty one would
            # resolve to the page itself and yield an empty file name.
            if not src:
                continue
            img_url = src if src.startswith("http") else urljoin(url, src)
            try:
                # Without a timeout a stalled server blocks the run forever.
                response = requests.get(img_url, timeout=30)
                # Do not store an HTTP error page as if it were an image.
                response.raise_for_status()
            except requests.RequestException as err:
                logger.warning(f"Failed to download image {img_url}: {err}")
                continue
            # safe="" also encodes "/", so the src becomes one flat file name.
            img_name = quote(src, "")

            with open(self.image_output_dir / img_name, "wb") as f:
                f.write(response.content)
        # Reported once for the whole page rather than once per image.
        logger.info(f"Extracted the HTML file from {url} to images")

extract2images()

Extract the images referenced by the saved HTML file and save them to the output directory

Source code in Docs2KG/parser/web/web2images.py
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
def extract2images(self):
    """
    Extract the images referenced by the saved HTML file and save them
    to the output directory.

    Reads ``index.html`` from ``self.output_dir``, resolves the ``src`` of
    every ``<img>`` tag against the page URL, downloads it and writes the
    bytes under ``<output_dir>/images``. Each file is named after the fully
    percent-encoded original ``src`` value.

    ``<img>`` tags without a usable ``src`` are skipped. An image that
    cannot be downloaded is logged and skipped, so the remaining images
    are still processed.
    """
    url = unquote(self.url)
    with open(f"{self.output_dir}/index.html", "r") as f:
        html_content = f.read()
    soup = BeautifulSoup(html_content, "html.parser")
    for imgtag in soup.find_all("img"):
        src = imgtag.get("src")
        # A missing src would crash on .startswith; an empty one would
        # resolve to the page itself and yield an empty file name.
        if not src:
            continue
        img_url = src if src.startswith("http") else urljoin(url, src)
        try:
            # Without a timeout a stalled server blocks the run forever.
            response = requests.get(img_url, timeout=30)
            # Do not store an HTTP error page as if it were an image.
            response.raise_for_status()
        except requests.RequestException as err:
            logger.warning(f"Failed to download image {img_url}: {err}")
            continue
        # safe="" also encodes "/", so the src becomes one flat file name.
        img_name = quote(src, "")

        with open(f"{self.output_dir}/images/{img_name}", "wb") as f:
            f.write(response.content)
    # Reported once for the whole page rather than once per image.
    logger.info(f"Extracted the HTML file from {url} to images")