Skip to content

Excel2markdown

Excel2Markdown

Bases: ExcelParseBase

Source code in Docs2KG/parser/excel/excel2markdown.py
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
class Excel2Markdown(ExcelParseBase):
    """
    Convert every sheet of an Excel workbook to markdown text.

    The markdown of all sheets is collected into a single CSV file,
    ``<output_dir>/texts/md.csv``, with one row per sheet.
    """

    def __init__(self, *args, **kwargs):
        """
        Initialize the Excel2Markdown class.

        All positional and keyword arguments are forwarded unchanged to
        ``ExcelParseBase``. The ``texts`` output directory is created if it
        does not exist yet.
        """
        super().__init__(*args, **kwargs)
        # NOTE(review): ``self.output_dir`` is presumably set by
        # ExcelParseBase.__init__ as a path-like object — confirm.
        self.text_output = self.output_dir / "texts"
        self.text_output.mkdir(parents=True, exist_ok=True)
        self.md_csv = self.text_output / "md.csv"

    def extract2markdown(self):
        """
        Render each sheet as a markdown table and save the result as CSV.

        The CSV written to ``self.md_csv`` has the columns ``sheet_name``,
        ``text`` (the sheet rendered with ``DataFrame.to_markdown``) and
        ``page_number`` (zero-based position of the sheet in the order
        pandas returns them).

        Returns:
            None. The result is only written to disk.
        """
        # sheet_name=None makes pandas load every sheet into a
        # {sheet name: DataFrame} mapping instead of only the first sheet.
        # NOTE(review): ``self.excel_file`` is presumably provided by
        # ExcelParseBase — confirm.
        sheets = pd.read_excel(self.excel_file, sheet_name=None)
        rows = [
            {
                "sheet_name": sheet_name,
                "text": sheet_data.to_markdown(),
                "page_number": page_number,
            }
            for page_number, (sheet_name, sheet_data) in enumerate(sheets.items())
        ]
        # Pin the columns so the CSV header is written even if no sheet was read.
        md_df = pd.DataFrame(rows, columns=["sheet_name", "text", "page_number"])
        md_df.to_csv(self.md_csv, index=False)

__init__(*args, **kwargs)

Initialize the Excel2Markdown class.

Source code in Docs2KG/parser/excel/excel2markdown.py
10
11
12
13
14
15
16
17
def __init__(self, *args, **kwargs):
    """
    Set up the Excel2Markdown parser and its text output location.

    Every argument is passed straight through to the parent initializer.
    Afterwards the ``texts`` directory under ``self.output_dir`` is created
    (if missing) and the path of the ``md.csv`` result file is recorded.
    """
    super().__init__(*args, **kwargs)
    # NOTE(review): ``self.output_dir`` is presumably set by the parent
    # initializer as a path-like object — confirm.
    texts_dir = self.output_dir / "texts"
    self.text_output = texts_dir
    texts_dir.mkdir(parents=True, exist_ok=True)
    self.md_csv = texts_dir / "md.csv"