Source code for tabularxls.tabular_utils

# -*- coding: utf-8 -*-
"""
Helper functions and classes for working with tabular data
"""
import logging
import re
import pandas as pd
from pandas import DataFrame
from pandas.io.formats.excel import ExcelFormatter
import matplotlib.colors as mlc
from pathlib import Path
from typing import Union

try:
    import cbsplotlib
except ImportError:
    cbsplotlib = None
else:
    import cbsplotlib.colors as cbc

_logger = logging.getLogger(__name__)


[docs] def get_color_names(min_color_length=2): """ Get the color name definitions obtained from the matplot default list Args: min_color_length (int): minimum length of the color names Notes: * By default, all matlotlib colors are taken * In case cbsplotlib is installed, also all CBS color definitions are taken Returns: list: All the default color names """ colors = [c.replace("xkcd:", "") for c in mlc.get_named_colors_mapping().keys()] if cbsplotlib is not None: cbs_colors = [c.replace("cbs:", "") for c in cbc.CBS_COLORS.keys()] colors.extend(cbs_colors) defaults_colors = [c for c in colors if len(c) > min_color_length] return defaults_colors
[docs] def get_color_code(color_name: str): """ Get the code belonging to a color name Args: color_name (str): Name of the color Returns: str: Color code """ color_code = None if cbsplotlib is not None: # in case cbsplotlib is imported, first try to obtain the CBS color definition try: color_code = cbc.CBS_COLORS_HEX[color_name] except KeyError: try: color_code = cbc.CBS_COLORS_HEX["cbs:" + color_name] except KeyError: pass if color_code is None: # if no color code has been found yet, try to obtain it from the matplotlib color definitions try: color_code = mlc.get_named_colors_mapping()[color_name] except KeyError: _logger.info(f"Could find a value for the color name '{color_name}'") return color_code
[docs] def get_super(content): """ Convert normal characters to superscript codes Args: content (str): The string for which all characters need to be converted Returns: superscript_content (str): New string in only superscript characters """ normal = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+-=()" super_s = "ᴬᴮᶜᴰᴱᶠᴳᴴᴵᴶᴷᴸᴹᴺᴼᴾQᴿˢᵀᵁⱽᵂˣʸᶻᵃᵇᶜᵈᵉᶠᵍʰᶦʲᵏˡᵐⁿᵒᵖ۹ʳˢᵗᵘᵛʷˣʸᶻ⁰¹²³⁴⁵⁶⁷⁸⁹⁺⁻⁼⁽⁾" translations = content.maketrans("".join(normal), "".join(super_s)) superscript_content = content.translate(translations) return superscript_content
[docs] def replace_textsuper(cell): """ Replace LaTeX textsuperscript characters with superscript characters Args: cell (str): Cell contents for which the textsuperscript needs to be translated into superscript Notes: * Superscript in LaTeX is given with the *\\textsuperscript{}* command Returns: new_cell (str): Cell contents with all superscript translated into superscript characters """ if match := re.search("\\\\textsuperscript{(.*?)}", cell): content = match.group(1) content = clean_the_cells([content])[0] content = get_super(content) new_cell = re.sub("\\\\textsuperscript{(.*?)}", content, cell) else: new_cell = cell return new_cell
[docs] def get_multicolumns(clean_cell): """ Get the cell contents of a multicolumn cell Args: clean_cell (str): Cell contents of a multicolumn cell Returns: first_cell (str), n_col (int): The contents of the first cell and the number of following multicolumn cells """ if match := re.search("\\\\multicolumn{(.*?)}", clean_cell): n_col = int(match.group(1)) new_match = re.sub("\\\\multicolumn{(.*?)}", "", clean_cell) cell_format, first_cell = get_new_command(new_match) else: first_cell = clean_cell n_col = None return first_cell, n_col
[docs] def get_new_command(line): """ Get the contents of a LaTeX newcommand definition Args: line (str): Line potentially containing a newcommand definition Returns: """ parse_alias = True alias = list() pattern = list() curl_level = 0 for char in list(line): if char == "{": curl_level += 1 elif char == "}": curl_level -= 1 if curl_level > 0: if parse_alias: alias.append(char) else: pattern.append(char) else: if alias: parse_alias = False alias = "".join(alias) pattern = "".join(pattern) clean_patterns = clean_the_cells([alias, pattern]) return clean_patterns
[docs] def clean_the_cells(cells, aliases=None): """ Remove all spurious latex code from cell contents Args: cells (list): List of cells containing strings to be cleaned aliases (dict, optional): If aliases are passed (default None), all strings will be cleaned with the replacements defined in the aliases Returns: list: The new cell contents """ clean_cells = list() for cell in cells: clean_cell = replace_textsuper(cell) clean_cell, n_col = get_multicolumns(clean_cell) clean_cell = clean_cell.replace("\\rowcolor{white}", "") clean_cell = clean_cell.replace("\\cornercell{", "") clean_cell = clean_cell.replace("\\normalsize{", "") clean_cell = clean_cell.replace("\\textbf{", "") clean_cell = clean_cell.replace("\\emph{", "") clean_cell = clean_cell.replace("\\python{", "") clean_cell = clean_cell.replace("\\textemdash", "-") clean_cell = clean_cell.replace("\\textendash", "-") clean_cell = clean_cell.replace("\\numprint{", "") clean_cell = re.sub(r"\\hspace{.*?}", "", clean_cell) clean_cell = re.sub(r"\\vspace{.*?}", "", clean_cell) clean_cell = clean_cell.replace("}", "") clean_cell = clean_cell.replace("{", "") clean_cell = clean_cell.replace("\\", "") clean_cell = clean_cell.replace("--", "-") if aliases is not None: for alias, pattern in aliases.items(): if match := re.match(alias, clean_cell): clean_cell = clean_cell.replace(alias, pattern) clean_cells.append(clean_cell.strip()) if n_col is not None and n_col > 1: for ii in range(1, n_col): clean_cells.append("") return clean_cells
[docs] def parse_tabular( input_filename: Union[str, Path], multi_index: bool = False, search_and_replace: Union[dict, None] = None, encoding: str = "utf-8", top_row_merge: bool = False, ) -> DataFrame: """ Read the tabular file and convert contents to a data frame Args: input_filename (str or Path): Name of the LaTeX tabular file. multi_index (bool, optional): Convert the index into a multi index based on the first 2 columns. Defaults to False. search_and_replace (dict, optional): The search and replace strings stored in a dictionary. Defaults to None. encoding (str, optional): Encoding of the input file. Defaults to "utf-8" top_row_merge (bool, optional). Merge the top rows in the rows are multirow headers Returns: DataFrame: The cleaned tubular data stored in a dataframe """ _logger.debug(f"Reading file {input_filename}") with open(input_filename, encoding=encoding) as fp: lines = fp.readlines() rows = list() header_row = None aliases = dict() for line in lines: clean_line = line.strip() if clean_line.startswith("%") or clean_line == "": continue match = re.search("caption{(.*)}", clean_line) if match is not None: caption = match.group(1) _logger.debug(f"CAPTION : {caption}") match = re.search("newcommand", clean_line) if match is not None: alias, pattern = get_new_command(clean_line) aliases[alias] = pattern _logger.debug(f"alias {alias} -> {pattern}") # hyperref halen we weg # de pattern '\\hyperref[mijnref]{content cell}' vervangen we met 'content cell' clean_line = re.sub(r"\\hyperref\[.*\]{(.*)}", r"\1", clean_line) cells = clean_line.split("&") if len(cells) > 1: clean_cells = clean_the_cells(cells, aliases) if header_row is None: header_row = clean_cells else: rows.append(clean_cells) _logger.debug(f"INSIDE : {clean_line}") else: _logger.debug(f"OUTSIZE : {clean_line}") index_columns = header_row[0] empty_column_names = False if multi_index: if header_row[0] == "": header_row[0] = "l1" if header_row[1] == "": header_row[1] = "l2" table_df = pd.DataFrame.from_records(rows, columns=header_row) index_columns = ["l1", "l2"] empty_column_names = True else: if index_columns in header_row[1:]: _logger.warning( f"Your index columns has the same value '{index_columns}' as a column name. " "This might cause problems. Replacing now with index" ) index_columns = "index" header_row[0] = index_columns table_df = pd.DataFrame.from_records(rows, columns=header_row) if top_row_merge: # De eerste rij beschouwen als een multi column. Fix dat table_df = table_df.T.reset_index() first_two_columns = table_df.columns[:2].to_list() table_df = table_df.set_index(first_two_columns) table_df.index = table_df.index.rename(["", ""]) table_df = table_df.T first_single_col = table_df.columns[:1].to_list() name = first_single_col[0][1] table_df.set_index(first_single_col, drop=True, inplace=True) table_df.index = table_df.index.rename(name) top_name = table_df.columns[0][0] new_columns = ["/".join([top_name, mc[1]]) for mc in table_df.columns] table_df.columns = new_columns else: table_df.set_index(index_columns, drop=True, inplace=True) if empty_column_names: table_df.index = table_df.index.rename(["", ""]) for alias, pattern in aliases.items(): for col_name in table_df.columns: try: alias_exact = "^" + alias + "$" table_df[col_name] = table_df[col_name].str.replace( alias_exact, pattern, regex=True ) except AttributeError: pass if search_and_replace is not None: # to make sure that also regex in the index are replaced, reset is needed index_names = list(table_df.index.names) column_names = list(table_df.columns) new_column_names = column_names.copy() new_index_names = index_names.copy() table_df.reset_index(inplace=True) # replace all the search strings for search, replace in search_and_replace.items(): table_df.replace(search, replace, regex=True, inplace=True) new_index_names = [ re.sub(search, replace, name) for name in new_index_names ] new_names_columns = [ re.sub(search, replace, name) for name in new_column_names ] # put back table_df.set_index(index_columns, inplace=True, drop=True) if not all(x == y for x, y in zip(column_names, new_names_columns)): table_df.columns = new_names_columns if not all(x == y for x, y in zip(index_names, new_index_names)): table_df.index.names = new_index_names return table_df
[docs] class WorkBook: """ This class is responsible for working with Excel data Args: workbook: Excel workbook object to modify Attributes: left_align_italic (workbook format or None) left_align_italic_large (workbook format or None) left_align_italic_large_ul (workbook format or None) : setup for workbook left_align_helvetica (workbook format or None) : setup for workbook left_align_helvetica_bold (workbook format or None) : setup for workbook left_align_bold (workbook format or None) : setup for workbook left_align_bold_large (workbook format or None) : setup for workbook left_align_bold_larger (workbook format or None) : setup for workbook left_align (workbook format or None) : setup for workbook left_align_large_wrap (workbook format or None) : setup for workbook left_align_large_wrap_top (workbook format or None) : setup for workbook left_align_wrap (workbook format or None) : setup for workbook left_align_large (workbook format or None) : setup for workbook right_align (workbook format or None) : setup for workbook header_format (workbook format or None) : setup for workbook title_format (workbook format or None) : setup for workbook section_heading (workbook format or None) : setup for workbook footer_format (workbook format or None) : setup for workbook """ def __init__(self, workbook): """ Constructor of the Workbook class """ self.workbook = workbook self.left_align_italic = None self.left_align_italic_large = None self.left_align_italic_large_ul = None self.left_align_helvetica = None self.left_align_helvetica_bold = None self.left_align_bold = None self.left_align_bold_large = None self.left_align_bold_larger = None self.left_align = None self.left_align_large_wrap = None self.left_align_large_wrap_top = None self.left_align_wrap = None self.left_align_large = None self.right_align = None self.header_format = None self.title_format = None self.section_heading = None self.footer_format = None self.add_styles()
[docs] def add_styles(self): """ Add all the styles to this workbook """ self.left_align_helvetica = self.workbook.add_format( {"font": "helvetica", "align": "left", "font_size": 8, "border": 0} ) self.left_align_helvetica_bold = self.workbook.add_format( { "font": "helvetica", "bold": True, "align": "left", "font_size": 8, "border": 0, } ) self.left_align_italic = self.workbook.add_format( { "font": "arial", "italic": True, "align": "left", "font_size": 8, "border": 0, } ) self.left_align_italic_large = self.workbook.add_format( { "font": "arial", "italic": True, "align": "left", "font_size": 10, "border": 0, } ) self.left_align_italic_large_ul = self.workbook.add_format( { "font": "arial", "italic": True, "align": "left", "underline": True, "font_size": 10, "border": 0, } ) self.left_align_bold = self.workbook.add_format( { "font": "arial", "bold": True, "align": "left", "font_size": 8, "border": 0, } ) self.left_align_bold_large = self.workbook.add_format( { "font": "arial", "bold": True, "align": "left", "font_size": 10, "border": 0, } ) self.left_align_bold_larger = self.workbook.add_format( { "font": "arial", "bold": True, "align": "left", "font_size": 12, "border": 0, } ) self.left_align = self.workbook.add_format( {"font": "arial", "align": "left", "font_size": 8, "border": 0} ) self.left_align_large_wrap = self.workbook.add_format( { "font": "arial", "align": "left", "text_wrap": True, "font_size": 10, "border": 0, } ) self.left_align_large_wrap_top = self.workbook.add_format( { "font": "arial", "align": "left", "valign": "top", "text_wrap": True, "font_size": 10, "border": 0, } ) self.left_align_large = self.workbook.add_format( {"font": "arial", "align": "left", "font_size": 10, "border": 0} ) self.right_align = self.workbook.add_format( {"font": "arial", "align": "right", "font_size": 8, "border": 0} ) self.header_format = self.workbook.add_format( { "font": "arial", "bold": True, "italic": True, "text_wrap": True, "align": "left", "font_size": 8, } ) self.header_format.set_bottom() self.header_format.set_top() self.title_format = self.workbook.add_format( { "font": "arial", "bold": True, "italic": False, "text_wrap": True, "align": "centre", "font_size": 12, } ) self.section_heading = self.workbook.add_format( { "font": "arial", "bold": True, "italic": True, "text_wrap": True, "align": "left", "font_size": 11, } ) self.footer_format = self.workbook.add_format( { "font": "arial", "align": "left", "font_size": 8, } ) self.footer_format.set_top()
[docs] def set_format(self, color_name): """ Add color codes to the workbook style Args: color_name (str): Name of the color to add to this workbook Returns: str: Cell format with the color code definition assigned to it """ color_code = get_color_code(color_name) if color_code is not None: # a color code was found. add it as a definition to the work book cell_format = self.workbook.add_format({"font_size": 8}) cell_format.set_font_color(color_code) else: cell_format = None return cell_format
[docs] def update_width(label, max_width=None): """ Update the width of the current max_width based on the contents of the label Args: label (str): Label to check if its width a wider than max_width max_width (int or None): Current maximum width Returns: int: Maximum width encountered so far """ width = len(label) if max_width is None or width > max_width: max_width = width return max_width
[docs] def get_max_width(input_data, name, column_index=None): """ Determine the maximum string in an index or column of a Dataframe Args: input_data (DataFrame): The dataframe to check name (str): Name of the column to check column_index (int or None): Index of the column to check Returns: int: The maximum width of this column or index """ max_col_width = len(name) if column_index is None: # get the values of the column based on the name of the column values = input_data[name] else: # get the values of the column based on the index of the column values = input_data.iloc[:, column_index] for value in values: col_width = len(str(value)) if col_width > max_col_width: max_col_width = col_width return max_col_width
[docs] def find_color_name(line: str, minimal_color_length=2): """ Find the color name of a str Args: line (str): Line in which to find the color name minimal_color_length (int): Minimum color length Returns: str: The color found int the line """ found_color = None default_colors = get_color_names(min_color_length=minimal_color_length) for color_name in default_colors: if line.startswith(color_name): found_color = color_name break return found_color
[docs] def write_data_to_sheet_multiindex( data_df: DataFrame, file_name: str | Path, sheet_name="Sheet" ): """ Write the data to Excel file with format Args: data_df (DataFrame): The dataframe to write to Excel file_name (str): Name of the Excel file with format sheet_name (str): Name of the sheet to write to """ with pd.ExcelWriter(file_name, engine="xlsxwriter") as writer: data_df.to_excel(excel_writer=writer, sheet_name=sheet_name) ExcelFormatter.header_style = None workbook = writer.book worksheet = writer.sheets[sheet_name] wb = WorkBook(workbook=workbook) # we now just reset the index such that we loop over column only data_df.reset_index(inplace=True, allow_duplicates=True) character_width = 1 start_row = 0 for col_idx, column_name in enumerate(data_df.columns): col_width = get_max_width( input_data=data_df, name=column_name, column_index=col_idx ) _logger.info(f"Adjusting {column_name}/{col_idx} with width {col_width}") align = wb.left_align worksheet.set_column( col_idx, col_idx, col_width * character_width, cell_format=align ) worksheet.write(start_row, col_idx, column_name, wb.header_format) for idx, value in enumerate(data_df[column_name]): found_color_name = find_color_name(value) if found_color_name is not None: _logger.debug(f"Going to set {value} {found_color_name}") cell_format = wb.set_format(found_color_name) new_value = value.replace(found_color_name, "") if cell_format is not None: worksheet.write(idx + 1, col_idx, new_value, cell_format) else: _logger.debug("No color found") worksheet.write(idx + 1, col_idx, new_value) _logger.debug("Done")