Skip to content

Excel2image

Excel2Image

Bases: ExcelParseBase

Source code in Docs2KG/parser/excel/excel2image.py
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
class Excel2Image(ExcelParseBase):
    """
    Render every sheet of an Excel workbook to a PNG image.

    Images are written to an ``images`` directory below ``self.output_dir``,
    together with an ``images.csv`` file describing each rendered sheet.
    """

    def __init__(self, *args, **kwargs):
        """
        Initialize the Excel2Image class.

        All arguments are forwarded unchanged to ``ExcelParseBase``. The
        ``images`` sub-directory of ``self.output_dir`` is created if it does
        not exist yet.
        """
        super().__init__(*args, **kwargs)
        # NOTE(review): output_dir is presumably set by ExcelParseBase as a
        # pathlib.Path -- confirm against the base class.
        self.image_output_dir = self.output_dir / "images"
        self.image_output_dir.mkdir(parents=True, exist_ok=True)

    def excel2image_and_pdf(self):
        """
        Convert each sheet of the Excel file to a PNG image.

        Every sheet is read into a DataFrame, rendered to HTML and saved as
        ``<sheet_name>.png`` in ``self.image_output_dir``. One metadata row
        per sheet (``page_index``, ``filename``, ``file_path``,
        ``sheet_name``) is then written to ``images.csv`` in the same
        directory.

        NOTE: despite the method name, PDF export (via pdfkit) is currently
        disabled; only images are produced.
        """
        images = []
        # The context manager closes the workbook handle even when reading or
        # rendering a sheet raises.
        with pd.ExcelFile(self.excel_file) as xls:
            for index, sheet_name in enumerate(xls.sheet_names):
                # Reuse the already-open workbook instead of re-opening the
                # file from disk for every sheet.
                df = pd.read_excel(xls, sheet_name=sheet_name)
                # Header labels are not guaranteed to be strings (numeric or
                # date header cells), so only string labels are tested for
                # the "Unnamed..." placeholder prefix before being blanked.
                df.columns = [
                    (
                        ""
                        if isinstance(col, str) and col.startswith("Unnamed")
                        else col
                    )
                    for col in df.columns
                ]
                df = df.fillna("")  # Replace NaN values with an empty string
                html = df.to_html()

                image_path = f"{self.image_output_dir}/{sheet_name}.png"
                imgkit.from_string(html, image_path)
                logger.info(f"Image saved to {image_path}")

                images.append(
                    {
                        "page_index": index,
                        "filename": f"{sheet_name}.png",
                        "file_path": image_path,
                        "sheet_name": sheet_name,
                    }
                )
        images_df = pd.DataFrame(images)
        images_df.to_csv(self.image_output_dir / "images.csv", index=False)
        logger.info(f"Images metadata saved to {self.image_output_dir}/images.csv")

__init__(*args, **kwargs)

Initialize the Excel2Image class.

Source code in Docs2KG/parser/excel/excel2image.py
14
15
16
17
18
19
20
def __init__(self, *args, **kwargs):
    """
    Set up the converter and make sure its image directory exists.

    Every positional and keyword argument is handed to the parent
    initializer untouched. Afterwards ``image_output_dir`` points at the
    ``images`` folder below ``output_dir``, which is created on demand.
    """
    super().__init__(*args, **kwargs)
    self.image_output_dir = images_dir = self.output_dir / "images"
    # Missing parent folders are created too; an existing folder is fine.
    images_dir.mkdir(parents=True, exist_ok=True)

excel2image_and_pdf()

Convert each sheet of an Excel file to a PNG image (the PDF export is currently commented out in the source).

Source code in Docs2KG/parser/excel/excel2image.py
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
def excel2image_and_pdf(self):
    """
    Convert each sheet of the Excel file to a PNG image.

    Every sheet is read into a DataFrame, rendered to HTML and saved as
    ``<sheet_name>.png`` in ``self.image_output_dir``. One metadata row per
    sheet (``page_index``, ``filename``, ``file_path``, ``sheet_name``) is
    then written to ``images.csv`` in the same directory.

    NOTE: despite the method name, PDF export (via pdfkit) is currently
    disabled; only images are produced.
    """
    images = []
    # The context manager closes the workbook handle even when reading or
    # rendering a sheet raises.
    with pd.ExcelFile(self.excel_file) as xls:
        for index, sheet_name in enumerate(xls.sheet_names):
            # Reuse the already-open workbook instead of re-opening the file
            # from disk for every sheet.
            df = pd.read_excel(xls, sheet_name=sheet_name)
            # Header labels are not guaranteed to be strings (numeric or date
            # header cells), so only string labels are tested for the
            # "Unnamed..." placeholder prefix before being blanked.
            df.columns = [
                "" if isinstance(col, str) and col.startswith("Unnamed") else col
                for col in df.columns
            ]
            df = df.fillna("")  # Replace NaN values with an empty string
            html = df.to_html()

            image_path = f"{self.image_output_dir}/{sheet_name}.png"
            imgkit.from_string(html, image_path)
            logger.info(f"Image saved to {image_path}")

            images.append(
                {
                    "page_index": index,
                    "filename": f"{sheet_name}.png",
                    "file_path": image_path,
                    "sheet_name": sheet_name,
                }
            )
    images_df = pd.DataFrame(images)
    images_df.to_csv(self.image_output_dir / "images.csv", index=False)
    logger.info(f"Images metadata saved to {self.image_output_dir}/images.csv")