# -*- coding: utf-8 -*-
"""
Helper functions and classes for working with tabular data
"""
import logging
import re
import pandas as pd
from pandas import DataFrame
from pandas.io.formats.excel import ExcelFormatter
import matplotlib.colors as mlc
from pathlib import Path
from typing import Union
try:
import cbsplotlib
except ImportError:
cbsplotlib = None
else:
import cbsplotlib.colors as cbc
_logger = logging.getLogger(__name__)
[docs]
def get_color_names(min_color_length=2):
"""
Get the color name definitions obtained from the matplot default list
Args:
min_color_length (int): minimum length of the color names
Notes:
* By default, all matlotlib colors are taken
* In case cbsplotlib is installed, also all CBS color definitions are taken
Returns:
list: All the default color names
"""
colors = [c.replace("xkcd:", "") for c in mlc.get_named_colors_mapping().keys()]
if cbsplotlib is not None:
cbs_colors = [c.replace("cbs:", "") for c in cbc.CBS_COLORS.keys()]
colors.extend(cbs_colors)
defaults_colors = [c for c in colors if len(c) > min_color_length]
return defaults_colors
[docs]
def get_color_code(color_name: str):
"""
Get the code belonging to a color name
Args:
color_name (str): Name of the color
Returns:
str: Color code
"""
color_code = None
if cbsplotlib is not None:
# in case cbsplotlib is imported, first try to obtain the CBS color definition
try:
color_code = cbc.CBS_COLORS_HEX[color_name]
except KeyError:
try:
color_code = cbc.CBS_COLORS_HEX["cbs:" + color_name]
except KeyError:
pass
if color_code is None:
# if no color code has been found yet, try to obtain it from the matplotlib color definitions
try:
color_code = mlc.get_named_colors_mapping()[color_name]
except KeyError:
_logger.info(f"Could find a value for the color name '{color_name}'")
return color_code
[docs]
def get_super(content):
"""
Convert normal characters to superscript codes
Args:
content (str): The string for which all characters need to be converted
Returns:
superscript_content (str): New string in only superscript characters
"""
normal = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+-=()"
super_s = "ᴬᴮᶜᴰᴱᶠᴳᴴᴵᴶᴷᴸᴹᴺᴼᴾQᴿˢᵀᵁⱽᵂˣʸᶻᵃᵇᶜᵈᵉᶠᵍʰᶦʲᵏˡᵐⁿᵒᵖ۹ʳˢᵗᵘᵛʷˣʸᶻ⁰¹²³⁴⁵⁶⁷⁸⁹⁺⁻⁼⁽⁾"
translations = content.maketrans("".join(normal), "".join(super_s))
superscript_content = content.translate(translations)
return superscript_content
[docs]
def replace_textsuper(cell):
"""
Replace LaTeX textsuperscript characters with superscript characters
Args:
cell (str): Cell contents for which the textsuperscript needs to be translated into superscript
Notes:
* Superscript in LaTeX is given with the *\\textsuperscript{}* command
Returns:
new_cell (str): Cell contents with all superscript translated into superscript characters
"""
if match := re.search("\\\\textsuperscript{(.*?)}", cell):
content = match.group(1)
content = clean_the_cells([content])[0]
content = get_super(content)
new_cell = re.sub("\\\\textsuperscript{(.*?)}", content, cell)
else:
new_cell = cell
return new_cell
[docs]
def get_multicolumns(clean_cell):
"""
Get the cell contents of a multicolumn cell
Args:
clean_cell (str): Cell contents of a multicolumn cell
Returns:
first_cell (str), n_col (int): The contents of the first cell and the number of following multicolumn cells
"""
if match := re.search("\\\\multicolumn{(.*?)}", clean_cell):
n_col = int(match.group(1))
new_match = re.sub("\\\\multicolumn{(.*?)}", "", clean_cell)
cell_format, first_cell = get_new_command(new_match)
else:
first_cell = clean_cell
n_col = None
return first_cell, n_col
[docs]
def get_new_command(line):
"""
Get the contents of a LaTeX newcommand definition
Args:
line (str): Line potentially containing a newcommand definition
Returns:
"""
parse_alias = True
alias = list()
pattern = list()
curl_level = 0
for char in list(line):
if char == "{":
curl_level += 1
elif char == "}":
curl_level -= 1
if curl_level > 0:
if parse_alias:
alias.append(char)
else:
pattern.append(char)
else:
if alias:
parse_alias = False
alias = "".join(alias)
pattern = "".join(pattern)
clean_patterns = clean_the_cells([alias, pattern])
return clean_patterns
[docs]
def clean_the_cells(cells, aliases=None):
"""
Remove all spurious latex code from cell contents
Args:
cells (list): List of cells containing strings to be cleaned
aliases (dict, optional): If aliases are passed (default None), all strings will be cleaned with the
replacements defined in the aliases
Returns:
list: The new cell contents
"""
clean_cells = list()
for cell in cells:
clean_cell = replace_textsuper(cell)
clean_cell, n_col = get_multicolumns(clean_cell)
clean_cell = clean_cell.replace("\\rowcolor{white}", "")
clean_cell = clean_cell.replace("\\cornercell{", "")
clean_cell = clean_cell.replace("\\normalsize{", "")
clean_cell = clean_cell.replace("\\textbf{", "")
clean_cell = clean_cell.replace("\\emph{", "")
clean_cell = clean_cell.replace("\\python{", "")
clean_cell = clean_cell.replace("\\textemdash", "-")
clean_cell = clean_cell.replace("\\textendash", "-")
clean_cell = clean_cell.replace("\\numprint{", "")
clean_cell = re.sub(r"\\hspace{.*?}", "", clean_cell)
clean_cell = re.sub(r"\\vspace{.*?}", "", clean_cell)
clean_cell = clean_cell.replace("}", "")
clean_cell = clean_cell.replace("{", "")
clean_cell = clean_cell.replace("\\", "")
clean_cell = clean_cell.replace("--", "-")
if aliases is not None:
for alias, pattern in aliases.items():
if match := re.match(alias, clean_cell):
clean_cell = clean_cell.replace(alias, pattern)
clean_cells.append(clean_cell.strip())
if n_col is not None and n_col > 1:
for ii in range(1, n_col):
clean_cells.append("")
return clean_cells
[docs]
def parse_tabular(
input_filename: Union[str, Path],
multi_index: bool = False,
search_and_replace: Union[dict, None] = None,
encoding: str = "utf-8",
top_row_merge: bool = False,
) -> DataFrame:
"""
Read the tabular file and convert contents to a data frame
Args:
input_filename (str or Path): Name of the LaTeX tabular file.
multi_index (bool, optional): Convert the index into a multi index based on the first 2 columns. Defaults to
False.
search_and_replace (dict, optional): The search and replace strings stored in a dictionary. Defaults to None.
encoding (str, optional): Encoding of the input file. Defaults to "utf-8"
top_row_merge (bool, optional). Merge the top rows in the rows are multirow headers
Returns:
DataFrame: The cleaned tubular data stored in a dataframe
"""
_logger.debug(f"Reading file {input_filename}")
with open(input_filename, encoding=encoding) as fp:
lines = fp.readlines()
rows = list()
header_row = None
aliases = dict()
for line in lines:
clean_line = line.strip()
if clean_line.startswith("%") or clean_line == "":
continue
match = re.search("caption{(.*)}", clean_line)
if match is not None:
caption = match.group(1)
_logger.debug(f"CAPTION : {caption}")
match = re.search("newcommand", clean_line)
if match is not None:
alias, pattern = get_new_command(clean_line)
aliases[alias] = pattern
_logger.debug(f"alias {alias} -> {pattern}")
# hyperref halen we weg
# de pattern '\\hyperref[mijnref]{content cell}' vervangen we met 'content cell'
clean_line = re.sub(r"\\hyperref\[.*\]{(.*)}", r"\1", clean_line)
cells = clean_line.split("&")
if len(cells) > 1:
clean_cells = clean_the_cells(cells, aliases)
if header_row is None:
header_row = clean_cells
else:
rows.append(clean_cells)
_logger.debug(f"INSIDE : {clean_line}")
else:
_logger.debug(f"OUTSIZE : {clean_line}")
index_columns = header_row[0]
empty_column_names = False
if multi_index:
if header_row[0] == "":
header_row[0] = "l1"
if header_row[1] == "":
header_row[1] = "l2"
table_df = pd.DataFrame.from_records(rows, columns=header_row)
index_columns = ["l1", "l2"]
empty_column_names = True
else:
if index_columns in header_row[1:]:
_logger.warning(
f"Your index columns has the same value '{index_columns}' as a column name. "
"This might cause problems. Replacing now with index"
)
index_columns = "index"
header_row[0] = index_columns
table_df = pd.DataFrame.from_records(rows, columns=header_row)
if top_row_merge:
# De eerste rij beschouwen als een multi column. Fix dat
table_df = table_df.T.reset_index()
first_two_columns = table_df.columns[:2].to_list()
table_df = table_df.set_index(first_two_columns)
table_df.index = table_df.index.rename(["", ""])
table_df = table_df.T
first_single_col = table_df.columns[:1].to_list()
name = first_single_col[0][1]
table_df.set_index(first_single_col, drop=True, inplace=True)
table_df.index = table_df.index.rename(name)
top_name = table_df.columns[0][0]
new_columns = ["/".join([top_name, mc[1]]) for mc in table_df.columns]
table_df.columns = new_columns
else:
table_df.set_index(index_columns, drop=True, inplace=True)
if empty_column_names:
table_df.index = table_df.index.rename(["", ""])
for alias, pattern in aliases.items():
for col_name in table_df.columns:
try:
alias_exact = "^" + alias + "$"
table_df[col_name] = table_df[col_name].str.replace(
alias_exact, pattern, regex=True
)
except AttributeError:
pass
if search_and_replace is not None:
# to make sure that also regex in the index are replaced, reset is needed
index_names = list(table_df.index.names)
column_names = list(table_df.columns)
new_column_names = column_names.copy()
new_index_names = index_names.copy()
table_df.reset_index(inplace=True)
# replace all the search strings
for search, replace in search_and_replace.items():
table_df.replace(search, replace, regex=True, inplace=True)
new_index_names = [
re.sub(search, replace, name) for name in new_index_names
]
new_names_columns = [
re.sub(search, replace, name) for name in new_column_names
]
# put back
table_df.set_index(index_columns, inplace=True, drop=True)
if not all(x == y for x, y in zip(column_names, new_names_columns)):
table_df.columns = new_names_columns
if not all(x == y for x, y in zip(index_names, new_index_names)):
table_df.index.names = new_index_names
return table_df
[docs]
class WorkBook:
"""
This class is responsible for working with Excel data
Args:
workbook: Excel workbook object to modify
Attributes:
left_align_italic (workbook format or None)
left_align_italic_large (workbook format or None)
left_align_italic_large_ul (workbook format or None) : setup for workbook
left_align_helvetica (workbook format or None) : setup for workbook
left_align_helvetica_bold (workbook format or None) : setup for workbook
left_align_bold (workbook format or None) : setup for workbook
left_align_bold_large (workbook format or None) : setup for workbook
left_align_bold_larger (workbook format or None) : setup for workbook
left_align (workbook format or None) : setup for workbook
left_align_large_wrap (workbook format or None) : setup for workbook
left_align_large_wrap_top (workbook format or None) : setup for workbook
left_align_wrap (workbook format or None) : setup for workbook
left_align_large (workbook format or None) : setup for workbook
right_align (workbook format or None) : setup for workbook
header_format (workbook format or None) : setup for workbook
title_format (workbook format or None) : setup for workbook
section_heading (workbook format or None) : setup for workbook
footer_format (workbook format or None) : setup for workbook
"""
def __init__(self, workbook):
"""
Constructor of the Workbook class
"""
self.workbook = workbook
self.left_align_italic = None
self.left_align_italic_large = None
self.left_align_italic_large_ul = None
self.left_align_helvetica = None
self.left_align_helvetica_bold = None
self.left_align_bold = None
self.left_align_bold_large = None
self.left_align_bold_larger = None
self.left_align = None
self.left_align_large_wrap = None
self.left_align_large_wrap_top = None
self.left_align_wrap = None
self.left_align_large = None
self.right_align = None
self.header_format = None
self.title_format = None
self.section_heading = None
self.footer_format = None
self.add_styles()
[docs]
def add_styles(self):
"""
Add all the styles to this workbook
"""
self.left_align_helvetica = self.workbook.add_format(
{"font": "helvetica", "align": "left", "font_size": 8, "border": 0}
)
self.left_align_helvetica_bold = self.workbook.add_format(
{
"font": "helvetica",
"bold": True,
"align": "left",
"font_size": 8,
"border": 0,
}
)
self.left_align_italic = self.workbook.add_format(
{
"font": "arial",
"italic": True,
"align": "left",
"font_size": 8,
"border": 0,
}
)
self.left_align_italic_large = self.workbook.add_format(
{
"font": "arial",
"italic": True,
"align": "left",
"font_size": 10,
"border": 0,
}
)
self.left_align_italic_large_ul = self.workbook.add_format(
{
"font": "arial",
"italic": True,
"align": "left",
"underline": True,
"font_size": 10,
"border": 0,
}
)
self.left_align_bold = self.workbook.add_format(
{
"font": "arial",
"bold": True,
"align": "left",
"font_size": 8,
"border": 0,
}
)
self.left_align_bold_large = self.workbook.add_format(
{
"font": "arial",
"bold": True,
"align": "left",
"font_size": 10,
"border": 0,
}
)
self.left_align_bold_larger = self.workbook.add_format(
{
"font": "arial",
"bold": True,
"align": "left",
"font_size": 12,
"border": 0,
}
)
self.left_align = self.workbook.add_format(
{"font": "arial", "align": "left", "font_size": 8, "border": 0}
)
self.left_align_large_wrap = self.workbook.add_format(
{
"font": "arial",
"align": "left",
"text_wrap": True,
"font_size": 10,
"border": 0,
}
)
self.left_align_large_wrap_top = self.workbook.add_format(
{
"font": "arial",
"align": "left",
"valign": "top",
"text_wrap": True,
"font_size": 10,
"border": 0,
}
)
self.left_align_large = self.workbook.add_format(
{"font": "arial", "align": "left", "font_size": 10, "border": 0}
)
self.right_align = self.workbook.add_format(
{"font": "arial", "align": "right", "font_size": 8, "border": 0}
)
self.header_format = self.workbook.add_format(
{
"font": "arial",
"bold": True,
"italic": True,
"text_wrap": True,
"align": "left",
"font_size": 8,
}
)
self.header_format.set_bottom()
self.header_format.set_top()
self.title_format = self.workbook.add_format(
{
"font": "arial",
"bold": True,
"italic": False,
"text_wrap": True,
"align": "centre",
"font_size": 12,
}
)
self.section_heading = self.workbook.add_format(
{
"font": "arial",
"bold": True,
"italic": True,
"text_wrap": True,
"align": "left",
"font_size": 11,
}
)
self.footer_format = self.workbook.add_format(
{
"font": "arial",
"align": "left",
"font_size": 8,
}
)
self.footer_format.set_top()
[docs]
def update_width(label, max_width=None):
"""
Update the width of the current max_width based on the contents of the label
Args:
label (str): Label to check if its width a wider than max_width
max_width (int or None): Current maximum width
Returns:
int: Maximum width encountered so far
"""
width = len(label)
if max_width is None or width > max_width:
max_width = width
return max_width
[docs]
def get_max_width(input_data, name, column_index=None):
"""
Determine the maximum string in an index or column of a Dataframe
Args:
input_data (DataFrame): The dataframe to check
name (str): Name of the column to check
column_index (int or None): Index of the column to check
Returns:
int: The maximum width of this column or index
"""
max_col_width = len(name)
if column_index is None:
# get the values of the column based on the name of the column
values = input_data[name]
else:
# get the values of the column based on the index of the column
values = input_data.iloc[:, column_index]
for value in values:
col_width = len(str(value))
if col_width > max_col_width:
max_col_width = col_width
return max_col_width
[docs]
def find_color_name(line: str, minimal_color_length=2):
"""
Find the color name of a str
Args:
line (str): Line in which to find the color name
minimal_color_length (int): Minimum color length
Returns:
str: The color found int the line
"""
found_color = None
default_colors = get_color_names(min_color_length=minimal_color_length)
for color_name in default_colors:
if line.startswith(color_name):
found_color = color_name
break
return found_color
[docs]
def write_data_to_sheet_multiindex(
data_df: DataFrame, file_name: str | Path, sheet_name="Sheet"
):
"""
Write the data to Excel file with format
Args:
data_df (DataFrame): The dataframe to write to Excel
file_name (str): Name of the Excel file with format
sheet_name (str): Name of the sheet to write to
"""
with pd.ExcelWriter(file_name, engine="xlsxwriter") as writer:
data_df.to_excel(excel_writer=writer, sheet_name=sheet_name)
ExcelFormatter.header_style = None
workbook = writer.book
worksheet = writer.sheets[sheet_name]
wb = WorkBook(workbook=workbook)
# we now just reset the index such that we loop over column only
data_df.reset_index(inplace=True, allow_duplicates=True)
character_width = 1
start_row = 0
for col_idx, column_name in enumerate(data_df.columns):
col_width = get_max_width(
input_data=data_df, name=column_name, column_index=col_idx
)
_logger.info(f"Adjusting {column_name}/{col_idx} with width {col_width}")
align = wb.left_align
worksheet.set_column(
col_idx, col_idx, col_width * character_width, cell_format=align
)
worksheet.write(start_row, col_idx, column_name, wb.header_format)
for idx, value in enumerate(data_df[column_name]):
found_color_name = find_color_name(value)
if found_color_name is not None:
_logger.debug(f"Going to set {value} {found_color_name}")
cell_format = wb.set_format(found_color_name)
new_value = value.replace(found_color_name, "")
if cell_format is not None:
worksheet.write(idx + 1, col_idx, new_value, cell_format)
else:
_logger.debug("No color found")
worksheet.write(idx + 1, col_idx, new_value)
_logger.debug("Done")