Skip to content

Pdf2tables

PDF2Tables

Bases: PDFParserBase

Source code in Docs2KG/parser/pdf/pdf2tables.py
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
class PDF2Tables(PDFParserBase):
    """Extract the tables found in a PDF and save each one as a CSV file."""

    def __init__(self, *args, **kwargs):
        """
        Initialize the parser and create the table output directory.

        All arguments are forwarded unchanged to the parent initializer.
        The directory ``<output_dir>/tables`` is created if it is missing.
        """
        super().__init__(*args, **kwargs)
        self.table_output_dir = self.output_dir / "tables"
        self.table_output_dir.mkdir(parents=True, exist_ok=True)

    def extract2tables(self, output_csv: bool = False) -> pd.DataFrame:
        """
        Extract tables from the PDF file.

        Every detected table is always written to its own CSV file inside
        ``self.table_output_dir``, named ``page_<page_index>-table_<table_index>.csv``.
        ``page_index`` is zero-based, while ``table_index`` starts at 1 on
        each page.

        Args:
            output_csv (bool, optional): Whether to also write the summary
                dataframe to ``tables.csv`` in the table output directory.
                Defaults to False.

        Returns:
            pd.DataFrame: One row per extracted table with the columns
                ``page_index``, ``table_index``, ``bbox``, ``filename`` and
                ``file_path``. The columns are present even when no table
                was found.
        """
        columns = ["page_index", "table_index", "bbox", "filename", "file_path"]
        tables_list = []

        # Open the document in a context manager so that the underlying file
        # handle is released even if table extraction raises.
        with fitz.open(self.pdf_file) as doc:
            for page_index, page in enumerate(doc):
                tabs = page.find_tables()
                if not tabs.tables:
                    continue
                logger.debug(f"Found {len(tabs.tables)} tables on page {page_index}")
                for table_index, tab in enumerate(tabs.tables, start=1):
                    filename = "page_%s-table_%s.csv" % (page_index, table_index)
                    file_path = self.table_output_dir / filename
                    # Persist the table content itself as CSV.
                    tab.to_pandas().to_csv(file_path)
                    logger.debug(tab.bbox)
                    tables_list.append(
                        {
                            "page_index": page_index,
                            "table_index": table_index,
                            "bbox": tab.bbox,
                            "filename": filename,
                            "file_path": file_path,
                        }
                    )

        # Fixing the columns keeps the schema stable for an empty result.
        df = pd.DataFrame(tables_list, columns=columns)
        if output_csv:
            df.to_csv(self.table_output_dir / "tables.csv", index=False)
        return df

__init__(*args, **kwargs)

Initialize the class with the PDF file

Source code in Docs2KG/parser/pdf/pdf2tables.py
11
12
13
14
15
16
17
def __init__(self, *args, **kwargs):
    """
    Set up the parser and make sure the table output directory exists.

    All positional and keyword arguments are forwarded unchanged to the
    parent initializer. ``self.output_dir`` is read afterwards, so it is
    presumably provided by the parent class (not visible here).
    """
    super().__init__(*args, **kwargs)
    # Per-table CSV files live in a dedicated "tables" sub-directory.
    tables_dir = self.output_dir / "tables"
    self.table_output_dir = tables_dir
    tables_dir.mkdir(parents=True, exist_ok=True)

extract2tables(output_csv=False)

Extract tables from the PDF file

Parameters:

Name Type Description Default
output_csv bool

Whether to output the extracted data to a csv file. Defaults to False.

False

Returns:

Type Description
DataFrame

pd.DataFrame: The dataframe containing the table information

Source code in Docs2KG/parser/pdf/pdf2tables.py
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
def extract2tables(self, output_csv: bool = False) -> pd.DataFrame:
    """
    Extract tables from the PDF file.

    Every detected table is always written to its own CSV file inside
    ``self.table_output_dir``, named ``page_<page_index>-table_<table_index>.csv``.
    ``page_index`` is zero-based, while ``table_index`` starts at 1 on
    each page.

    Args:
        output_csv (bool, optional): Whether to also write the summary
            dataframe to ``tables.csv`` in the table output directory.
            Defaults to False.

    Returns:
        pd.DataFrame: One row per extracted table with the columns
            ``page_index``, ``table_index``, ``bbox``, ``filename`` and
            ``file_path``. The columns are present even when no table
            was found.
    """
    columns = ["page_index", "table_index", "bbox", "filename", "file_path"]
    tables_list = []

    # Open the document in a context manager so that the underlying file
    # handle is released even if table extraction raises.
    with fitz.open(self.pdf_file) as doc:
        for page_index, page in enumerate(doc):
            tabs = page.find_tables()
            if not tabs.tables:
                continue
            logger.debug(f"Found {len(tabs.tables)} tables on page {page_index}")
            for table_index, tab in enumerate(tabs.tables, start=1):
                filename = "page_%s-table_%s.csv" % (page_index, table_index)
                file_path = self.table_output_dir / filename
                # Persist the table content itself as CSV.
                tab.to_pandas().to_csv(file_path)
                logger.debug(tab.bbox)
                tables_list.append(
                    {
                        "page_index": page_index,
                        "table_index": table_index,
                        "bbox": tab.bbox,
                        "filename": filename,
                        "file_path": file_path,
                    }
                )

    # Fixing the columns keeps the schema stable for an empty result.
    df = pd.DataFrame(tables_list, columns=columns)
    if output_csv:
        df.to_csv(self.table_output_dir / "tables.csv", index=False)
    return df