Skip to content

Base

WebParserBase

Source code in Docs2KG/parser/web/base.py
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
class WebParserBase:
    """
    Base class for web parsers: fetches the page at a URL and stores it locally.

    Constructing an instance performs an HTTP GET on ``url`` straight away and,
    on a 200 response, saves the body as ``index.html`` in the input directory.
    """

    def __init__(
        self, url: str, output_dir: Path = None, input_dir: Path = None
    ) -> None:
        """
        Initialize the WebParserBase class

        Args:
            url (str): URL to download the HTML files
            output_dir (Path): Path to the output directory where the converted files will be saved.
                When None, ``DATA_OUTPUT_DIR / <percent-encoded url>`` is used and created.
            input_dir (Path): Path to the input directory where the html files will be downloaded.
                When None, ``DATA_INPUT_DIR`` is used by ``download_html_file``.
        """
        self.url = url
        self.output_dir = output_dir
        self.input_dir = input_dir
        # safe="" also encodes "/", so the whole URL becomes a single path component
        self.quoted_url = quote(url, "")
        if self.output_dir is None:
            self.output_dir = DATA_OUTPUT_DIR / self.quoted_url
            self.output_dir.mkdir(parents=True, exist_ok=True)

        self.download_html_file()

    def download_html_file(self):
        """
        Download the html file from the url and save it to the input directory

        The file is written as ``index.html`` inside ``self.input_dir``; when
        ``self.input_dir`` is None the file goes to ``DATA_INPUT_DIR`` instead.
        A non-200 response is logged as an error and nothing is written.
        """
        # Without a timeout, requests waits indefinitely on an unresponsive server.
        response = requests.get(self.url, timeout=30)
        if response.status_code != 200:
            logger.error(f"Failed to download the HTML file from {self.url}")
            return

        # Honour the caller-supplied input directory; fall back to the shared default.
        target_dir = Path(
            DATA_INPUT_DIR if self.input_dir is None else self.input_dir
        )
        target_dir.mkdir(parents=True, exist_ok=True)
        (target_dir / "index.html").write_bytes(response.content)
        logger.info(f"Downloaded the HTML file from {self.url}")

__init__(url, output_dir=None, input_dir=None)

Initialize the WebParserBase class

Parameters:

Name Type Description Default
url str

URL to download the HTML files

required
output_dir Path

Path to the output directory where the converted files will be saved

None
input_dir Path

Path to the input directory where the HTML files will be downloaded

None
Source code in Docs2KG/parser/web/base.py
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
def __init__(
    self, url: str, output_dir: Path = None, input_dir: Path = None
) -> None:
    """
    Set up the parser state for one URL and fetch its HTML immediately.

    Args:
        url (str): URL to download the HTML files
        output_dir (Path): Directory for the converted files. When None, a
            directory named after the percent-encoded URL is created under
            DATA_OUTPUT_DIR and used instead.
        input_dir (Path): Directory for the downloaded html files; stored
            on the instance as given.
    """
    self.url, self.input_dir = url, input_dir
    # safe="" also encodes "/", so the whole URL becomes a single path component
    self.quoted_url = quote(url, "")

    if output_dir is not None:
        self.output_dir = output_dir
    else:
        self.output_dir = DATA_OUTPUT_DIR / self.quoted_url
        self.output_dir.mkdir(parents=True, exist_ok=True)

    self.download_html_file()

download_html_file()

Download the HTML file from the URL and save it to the input directory

Source code in Docs2KG/parser/web/base.py
34
35
36
37
38
39
40
41
42
43
44
45
def download_html_file(self):
    """
    Download the html file from the url and save it to the input directory

    The file is written as ``index.html`` inside ``self.input_dir``; when
    ``self.input_dir`` is None the file goes to ``DATA_INPUT_DIR`` instead.
    A non-200 response is logged as an error and nothing is written.
    """
    # Without a timeout, requests waits indefinitely on an unresponsive server.
    response = requests.get(self.url, timeout=30)
    if response.status_code != 200:
        logger.error(f"Failed to download the HTML file from {self.url}")
        return

    # Honour the caller-supplied input directory; fall back to the shared default.
    target_dir = Path(DATA_INPUT_DIR if self.input_dir is None else self.input_dir)
    target_dir.mkdir(parents=True, exist_ok=True)
    (target_dir / "index.html").write_bytes(response.content)
    logger.info(f"Downloaded the HTML file from {self.url}")