In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import pickle

# Register tqdm's `progress_apply` on pandas objects; rename_others below calls it.
tqdm.pandas()

In [None]:
# Read the three dataset splits from CSV files in the parent directory.
train, valid, test = [
    pd.read_csv(csv_path)
    for csv_path in ("../train.csv", "../validation.csv", "../test.csv")
]

In [None]:
# Preview the first rows of the training split.
train.head()

In [None]:
# suit_count: rows left after de-duplicating each split on process_id, summed over splits.
# doc_count: rows whose `pages` value equals 1, summed over splits.
suit_count = sum(len(split.drop_duplicates("process_id")) for split in (train, valid, test))
doc_count = sum(len(split.loc[split["pages"] == 1]) for split in (train, valid, test))
suit_count, doc_count

In [None]:
# Total number of rows across the three splits.
sum(len(split) for split in (train, valid, test))

In [None]:
# Row counts per document_type value, computed separately for each split.
train_pages, valid_pages, test_pages = [
    split["document_type"].value_counts() for split in (train, valid, test)
]
train_pages, valid_pages, test_pages

In [None]:
# Same breakdown as above, restricted to rows whose `pages` value equals 1.
train_docs, valid_docs, test_docs = [
    split.loc[split["pages"] == 1, "document_type"].value_counts()
    for split in (train, valid, test)
]
train_docs, valid_docs, test_docs

In [None]:
def type_distribution_by_page(df):
    """Count rows per document type.

    The count is taken over the ``themes`` column, so rows whose ``themes``
    value is null are not counted.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``document_type`` and ``themes`` columns.

    Returns
    -------
    pandas.DataFrame
        Columns ``Category`` (the document type) and ``Pages`` (the count),
        one row per document type in groupby key order.
    """
    themes_per_type = df.groupby("document_type")["themes"].count()
    return pd.DataFrame(
        data={
            "Category": themes_per_type.index.values,
            "Pages": themes_per_type.values,
        }
    )

In [None]:
def type_distribution_by_document(df):
    """Count, per document type, the rows whose ``pages`` value equals 1.

    Presumably ``pages == 1`` marks the first page of a document, so this
    yields one count per document -- confirm against the dataset description.
    The count is taken over the ``themes`` column, so rows whose ``themes``
    value is null are not counted.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``document_type``, ``pages`` and ``themes`` columns.

    Returns
    -------
    pandas.DataFrame
        Columns ``Category`` (the document type) and ``Documents`` (the
        count), one row per document type in groupby key order.
    """
    page_one_rows = df.loc[df["pages"] == 1]
    themes_per_type = page_one_rows.groupby("document_type")["themes"].count()
    return pd.DataFrame(
        data={
            "Category": themes_per_type.index.values,
            "Documents": themes_per_type.values,
        }
    )

In [None]:
# Per-document-type counts of `pages == 1` rows for each split.
train_count_docs, valid_count_docs, test_count_docs = [
    type_distribution_by_document(split) for split in (train, valid, test)
]

In [None]:
# Per-document-type row counts for each split.
train_count_pages, valid_count_pages, test_count_pages = [
    type_distribution_by_page(split) for split in (train, valid, test)
]

In [None]:
# Assemble one summary table keyed by the training split's categories.
# Left joins keep every training category; a category absent from another
# split ends up as NaN in that split's columns.
type_dist = pd.DataFrame()
type_dist["dataset"] = len(train_count_pages) * [" "]
type_dist["Category"] = train_count_pages["Category"].values

# With pandas' default merge suffixes the resulting count columns are named
# Documents_x / Pages_x (train), Documents_y / Pages_y (validation) and
# Documents / Pages (test).
type_dist = type_dist.merge(
    train_count_docs[["Category", "Documents"]], on="Category", sort=False, how="left"
)
type_dist = type_dist.merge(
    train_count_pages[["Category", "Pages"]], on="Category", sort=False, how="left"
)
type_dist = type_dist.merge(
    valid_count_docs[["Category", "Documents"]], on="Category", sort=False, how="left"
)
type_dist = type_dist.merge(
    valid_count_pages[["Category", "Pages"]], on="Category", sort=False, how="left"
)
type_dist = type_dist.merge(
    test_count_docs[["Category", "Documents"]], on="Category", sort=False, how="left"
)
type_dist = type_dist.merge(
    test_count_pages[["Category", "Pages"]], on="Category", sort=False, how="left"
)

In [None]:
# Use fully qualified option names: abbreviated keys are resolved by pattern
# matching, and "precision" is ambiguous on pandas versions that also define
# "styler.format.precision", which makes set_option raise OptionError.
pd.set_option("display.float_format", "{:,}".format)
pd.set_option("display.precision", 0)

# Convert every count column (from "Documents_x" to the last column) to a
# numeric dtype, downcast to the smallest float that fits.
type_dist.loc[:, "Documents_x":] = type_dist.loc[:, "Documents_x":].apply(
    pd.to_numeric, downcast="float"
)
type_dist

In [None]:
# Emit the table as LaTeX source without the row index; print() keeps the raw markup readable.
print(type_dist.to_latex(index=False))

In [None]:
# Theme ids (as strings) that are kept as-is; replace_others maps any other id to "0".
themes = {
    "5", "6", "26", "33", "139", "163", "232",
    "313", "339", "350", "406", "409", "555", "589",
    "597", "634", "660", "695", "729", "766", "773",
    "793", "800", "810", "852", "895", "951", "975",
}

In [None]:
def replace_others(sentence, allowed=None):
    """Collapse every theme id outside the allowed set into the catch-all id "0".

    Parameters
    ----------
    sentence : str
        A bracketed, comma-separated list of ids such as ``"[5, 12, 33]"``.
        Surrounding ``[``/``]`` characters and spaces around each id are removed.
    allowed : collection of str, optional
        Ids to keep unchanged. Defaults to the module-level ``themes`` set.

    Returns
    -------
    str
        The de-duplicated ids joined with ``","``. Ids appear in first-seen
        order, so the result is the same on every run (joining a ``set``
        would make the order depend on string hashing).
    """
    keep = themes if allowed is None else allowed
    tokens = (token.strip(" ") for token in sentence.strip("[]").split(","))
    mapped = (token if token in keep else "0" for token in tokens)
    # dict.fromkeys drops duplicates while preserving insertion order.
    return ",".join(dict.fromkeys(mapped))

def rename_others(df, theme_col="themes"):
    """Apply ``replace_others`` to every value of ``theme_col``, showing a progress bar.

    The column is overwritten on ``df`` itself, and that same frame is
    returned. Requires ``tqdm.pandas()`` to have been called so that
    ``progress_apply`` exists.
    """
    cleaned_column = df[theme_col].progress_apply(replace_others)
    df[theme_col] = cleaned_column
    return df

In [None]:
# Normalise the `themes` column of every split (rename_others rewrites the column on the frame it is given).
train, valid, test = [rename_others(split) for split in (train, valid, test)]

In [None]:
def disentangle_themes(df, lst_col="themes"):
    """Expand a comma-separated column so that each listed value gets its own row.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    lst_col : str, default "themes"
        Name of the string column holding comma-separated values.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns in the same order. Each input row is
        repeated once per value in ``lst_col``, which then holds a single
        value per row. The index is a fresh 0..n-1 range.

    Notes
    -----
    Built on ``DataFrame.explode`` rather than ``np.repeat``/``np.concatenate``:
    an empty frame yields an empty result instead of raising ``ValueError``,
    and the dtypes of the other columns are preserved. Rows whose ``lst_col``
    is null are kept as a single null row.
    """
    split_lists = df[lst_col].str.split(",")
    exploded = df.assign(**{lst_col: split_lists}).explode(lst_col)
    return exploded.reset_index(drop=True)
    

In [None]:
# Expand every split so each row carries exactly one theme id.
train, valid, test = [disentangle_themes(split) for split in (train, valid, test)]

In [None]:
def theme_distribution_by_process(df):
    """Count distinct ``process_id`` values per theme.

    Duplicate (``themes``, ``process_id``) pairs are dropped first, so each
    process contributes at most once to a theme.

    Returns
    -------
    pandas.DataFrame
        Columns ``index`` (row position before sorting, left behind by
        ``reset_index``), ``themes`` (converted to numeric) and
        ``process_count``, sorted by ascending ``themes``.
    """
    unique_pairs = df.drop_duplicates(["themes", "process_id"])
    per_theme = unique_pairs.groupby("themes").count()
    per_theme.index = pd.to_numeric(per_theme.index)
    summary = pd.DataFrame(
        data={
            "themes": per_theme.index.values,
            "process_count": per_theme.process_id.values,
        }
    )
    return summary.sort_values("themes", axis=0).reset_index()

In [None]:
# Per-theme count of distinct process_id values in the training split.
train_count_process = theme_distribution_by_process(train)

In [None]:
# Per-theme count of distinct process_id values in the validation split.
valid_count_process = theme_distribution_by_process(valid)
valid_count_process

In [None]:
# Per-theme count of distinct process_id values in the test split.
test_count_process = theme_distribution_by_process(test)
test_count_process

In [None]:
def theme_distribution_by_page(df):
    """Count rows per theme.

    Every row with a non-null ``process_id`` is counted; no de-duplication is
    applied. The count column is nevertheless named ``process_count``, which
    is what the merge cells later in the notebook select.

    Returns
    -------
    pandas.DataFrame
        Columns ``index`` (row position before sorting, left behind by
        ``reset_index``), ``themes`` (converted to numeric) and
        ``process_count``, sorted by ascending ``themes``.
    """
    rows_per_theme = df.groupby("themes").count()
    rows_per_theme.index = pd.to_numeric(rows_per_theme.index)
    summary = pd.DataFrame(
        data={
            "themes": rows_per_theme.index.values,
            "process_count": rows_per_theme.process_id.values,
        }
    )
    return summary.sort_values("themes", axis=0).reset_index()

In [None]:
# Per-theme row counts for the training split.
# NOTE: this rebinds train_count_pages, which earlier held the per-document-type
# table; cells that expect the "Category"/"Pages" columns cannot be re-run after this.
train_count_pages = theme_distribution_by_page(train)

In [None]:
# Per-theme row counts for the validation split.
# NOTE: this rebinds valid_count_pages, which earlier held the per-document-type table.
valid_count_pages = theme_distribution_by_page(valid)

In [None]:
# Per-theme row counts for the test split.
# NOTE: this rebinds test_count_pages, which earlier held the per-document-type table.
test_count_pages = theme_distribution_by_page(test)

In [None]:
# Start the per-theme summary table with a blank "dataset" column, one row per training theme.
final_table = pd.DataFrame()
final_table["dataset"] = len(train_count_pages) * [" "]
final_table

In [None]:
# Use the training split's (sorted) theme ids as the join key column.
final_table["themes"] = train_count_process["themes"].values
final_table

In [None]:
# Attach the training split's per-theme process counts.
final_table = final_table.merge(
    train_count_process[["process_count", "themes"]],
    on="themes",
    sort=False,
    how="left",
)
final_table

In [None]:
# Every count frame names its value column "process_count". Merging them
# unrenamed relies on pandas' automatic _x/_y suffixes, and the third
# overlapping merge would regenerate "process_count_x"/"process_count_y",
# which already exist; pandas 2.x raises MergeError in that case. Renaming
# each count column to its final name before merging avoids any overlap.

# The count column merged in so far comes from train_count_process.
final_table = final_table.rename(columns={"process_count": "train_docs"})

# Left joins keep every training theme; themes missing from a split get 0.
final_table = final_table.merge(
    train_count_pages[["process_count", "themes"]].rename(columns={"process_count": "train_pages"}),
    on="themes", sort=False, how="left",
).fillna(0)
final_table = final_table.merge(
    valid_count_process[["process_count", "themes"]].rename(columns={"process_count": "valid_docs"}),
    on="themes", sort=False, how="left",
).fillna(0)
final_table = final_table.merge(
    valid_count_pages[["process_count", "themes"]].rename(columns={"process_count": "valid_pages"}),
    on="themes", sort=False, how="left",
).fillna(0)
final_table = final_table.merge(
    test_count_process[["process_count", "themes"]].rename(columns={"process_count": "test_docs"}),
    on="themes", sort=False, how="left",
).fillna(0)
final_table = final_table.merge(
    test_count_pages[["process_count", "themes"]].rename(columns={"process_count": "test_pages"}),
    on="themes", sort=False, how="left",
).fillna(0)

# Pin the column order; this fails loudly if an expected column is missing.
final_table = final_table[
    ["dataset", "themes", "train_docs", "train_pages", "valid_docs", "valid_pages", "test_docs", "test_pages"]
]

In [None]:
# Optional display formatting, currently disabled. If re-enabled, use the fully
# qualified option names ("display.float_format", "display.precision"); the
# abbreviated "precision" key is ambiguous on newer pandas versions.
# pd.set_option("float_format", '{:,}'.format)
# pd.set_option('precision', 0)
# final_table.loc[:, final_table.columns != "dataset"] = final_table.loc[:, final_table.columns != "dataset"].apply(pd.to_numeric, downcast="float")
final_table

In [None]:
# Emit the per-theme table as LaTeX source without the row index.
print(final_table.to_latex(index=False))

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

ax_themes = final_table.plot(x="themes", y=["train_docs", "valid_docs", "test_docs"], kind="bar",stacked=True, figsize=(8,6),
                fontsize=12)
ax_themes.legend(["Train set", "Validation set", "Test set"], prop={"size": 14})
ax_themes.set_xlabel("Themes", size=14)
ax_themes.set_ylabel("Suits", size=14)
ax_themes.set_title("Theme Distribution (Medium)", size=14)
ax_themes.get_figure().savefig("plots/medium_theme_distribution.pdf")