Base

`WebParserBase`

Source code in Docs2KG/parser/web/base.py

class WebParserBase:
    """
    Base class for web parsers: downloads the page behind a URL to disk.

    The download happens during construction, so creating an instance
    performs a network request.
    """

    def __init__(
        self, url: str, output_dir: Path = None, input_dir: Path = None
    ) -> None:
        """
        Initialize the WebParserBase class

        Args:
            url (str): URL to download the HTML files
            output_dir (Path): Path to the output directory where the converted files will be saved.
                When None, ``DATA_OUTPUT_DIR / <percent-encoded url>`` is used and created.
            input_dir (Path): Path to the input directory where the html files will be downloaded.
                When None, the file is saved under ``DATA_INPUT_DIR``.
        """
        self.url = url
        self.output_dir = output_dir
        self.input_dir = input_dir
        # safe="" also encodes "/" so the whole URL becomes a single path component.
        self.quoted_url = quote(url, "")
        if self.output_dir is None:
            self.output_dir = DATA_OUTPUT_DIR / self.quoted_url
            self.output_dir.mkdir(parents=True, exist_ok=True)

        self.download_html_file()

    def download_html_file(self):
        """
        Download the html file from the url and save it to the input directory

        The response body is written to ``index.html`` inside ``self.input_dir``
        (or ``DATA_INPUT_DIR`` when no input directory was given). A non-200
        response is logged as an error and nothing is written.
        """
        # Honour the caller-supplied input directory; keep the historical
        # DATA_INPUT_DIR location as the fallback.
        target_dir = Path(DATA_INPUT_DIR if self.input_dir is None else self.input_dir)

        # Without a timeout requests waits forever on an unresponsive server.
        response = requests.get(self.url, timeout=30)
        if response.status_code == 200:
            # The directory may not exist yet (especially a user-supplied one).
            target_dir.mkdir(parents=True, exist_ok=True)
            with open(target_dir / "index.html", "wb") as f:
                f.write(response.content)
            logger.info(f"Downloaded the HTML file from {self.url}")
        else:
            logger.error(f"Failed to download the HTML file from {self.url}")

`__init__(url, output_dir=None, input_dir=None)`

Initialize the WebParserBase class

Parameters:

Name	Type	Description	Default
`url`	`str`	URL to download the HTML files	required
`output_dir`	`Path`	Path to the output directory where the converted files will be saved	`None`
`input_dir`	`Path`	Path to the input directory where the html files will be downloaded	`None`

Source code in Docs2KG/parser/web/base.py

def __init__(
    self, url: str, output_dir: Path = None, input_dir: Path = None
) -> None:
    """
    Set up the parser for a URL and immediately download its HTML.

    Args:
        url (str): URL of the page to download
        output_dir (Path): Directory for the converted files; when None it
            defaults to DATA_OUTPUT_DIR / <percent-encoded url> and is created
        input_dir (Path): Input directory intended for the downloaded html
            files; stored unchanged on the instance
    """
    self.url = url
    self.input_dir = input_dir
    # An empty "safe" set means "/" is percent-encoded as well.
    self.quoted_url = quote(url, "")

    if output_dir is not None:
        self.output_dir = output_dir
    else:
        # No explicit output directory: derive one from the encoded URL.
        self.output_dir = DATA_OUTPUT_DIR / self.quoted_url
        self.output_dir.mkdir(parents=True, exist_ok=True)

    # Construction triggers the download.
    self.download_html_file()

`download_html_file()`

Download the html file from the url and save it to the input directory

Source code in Docs2KG/parser/web/base.py

def download_html_file(self):
    """
    Download the html file from the url and save it to the input directory

    The response body is written to ``index.html`` inside ``self.input_dir``
    (or ``DATA_INPUT_DIR`` when no input directory was given). A non-200
    response is logged as an error and nothing is written.
    """
    # Honour the caller-supplied input directory; keep the historical
    # DATA_INPUT_DIR location as the fallback.
    target_dir = Path(DATA_INPUT_DIR if self.input_dir is None else self.input_dir)

    # Without a timeout requests waits forever on an unresponsive server.
    response = requests.get(self.url, timeout=30)
    if response.status_code == 200:
        # The directory may not exist yet (especially a user-supplied one).
        target_dir.mkdir(parents=True, exist_ok=True)
        with open(target_dir / "index.html", "wb") as f:
            f.write(response.content)
        logger.info(f"Downloaded the HTML file from {self.url}")
    else:
        logger.error(f"Failed to download the HTML file from {self.url}")

Base

WebParserBase

__init__(url, output_dir=None, input_dir=None)

download_html_file()

`WebParserBase`

`__init__(url, output_dir=None, input_dir=None)`

`download_html_file()`