In [None]:
%load_ext autoreload
%autoreload 2

import os
import pickle
import sys
from ast import literal_eval

import joblib
import numpy as np
import pandas as pd
from joblib import dump, load
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, classification_report, hamming_loss, zero_one_loss, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, MultiLabelBinarizer
from xgboost import XGBClassifier

In [None]:
# Theme IDs that are kept as individual classes. The data-loading cells map
# every theme ID that is not listed here to the catch-all label 0.
THEMES = [5, 6, 26, 33, 139, 163, 232, 313, 339, 350, 406, 409, 555, 589,
          597, 634, 660, 695, 729, 766, 773, 793, 800, 810, 852, 895, 951, 975]
# CSV files read through get_data() for the train / test / validation splits.
TRAIN_DATA_PATH = '../train.csv'
TEST_DATA_PATH = '../test.csv'
VALIDATION_DATA_PATH = '../validation.csv'

In [None]:
def groupby_process(df):
    """Merge the page bodies of every process into a single document.

    Rows are ordered by ``process_id`` and ``page`` and then grouped by
    (``process_id``, ``themes``). The ``body`` values of each group are joined
    with a single space, keeping the sorted row order; missing bodies are
    skipped by ``Series.str.cat``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``process_id``, ``page``, ``themes`` and
        ``body``.

    Returns
    -------
    pandas.DataFrame
        One row per (``process_id``, ``themes``) group with the columns
        ``process_id``, ``themes`` and ``body``. Index labels are strings,
        exactly as the previous implementation produced them.
    """
    ordered = df.sort_values(['process_id', 'page'])
    # Aggregate the ``body`` column directly instead of applying a lambda to
    # the whole grouped frame: the result column is already named ``body`` and
    # the grouping columns are never handed to the aggregation function.
    merged = (
        ordered
        .groupby(['process_id', 'themes'])['body']
        .agg(lambda pages: pages.str.cat(sep=' '))
        .reset_index()
    )
    # Kept for backward compatibility: callers receive string index labels.
    return merged.rename(index=str)

def get_data(path, preds=None, key=None):
    """Read one CSV split and return it grouped per process.

    The file at ``path`` is loaded with pandas, a ``pages`` column (if present)
    is renamed to ``page``, the rows are merged by ``groupby_process`` and the
    ``themes`` column is parsed with ``ast.literal_eval`` (presumably from the
    string form of a list - confirm against the CSV files).

    ``preds`` and ``key`` are accepted but not used by this function.
    """
    frame = groupby_process(pd.read_csv(path).rename(columns={'pages': 'page'}))
    frame.themes = frame.themes.apply(literal_eval)
    return frame

def transform_y(train_labels, test_labels):
    """Binarize multilabel targets with a binarizer fitted on the train labels.

    A ``MultiLabelBinarizer`` is fitted on ``train_labels`` only and then used
    to transform both label collections. The learned classes are printed.

    Returns
    -------
    tuple
        ``(encoded_train, encoded_test, fitted_binarizer)``.
    """
    binarizer = MultiLabelBinarizer().fit(train_labels)
    # Transform train first, then test, before printing the learned classes.
    encoded = tuple(
        binarizer.transform(labels) for labels in (train_labels, test_labels)
    )

    print(binarizer.classes_)

    return encoded + (binarizer,)

In [None]:
# Load the three splits (one row per process, themes parsed by get_data()).
train_data = get_data(TRAIN_DATA_PATH)
test_data = get_data(TEST_DATA_PATH)
validation_data = get_data(VALIDATION_DATA_PATH)

# Collapse every theme that is not in THEMES into the catch-all label 0.
# Deduplicate first and sort afterwards: the previous list(set(sorted(...)))
# sorted the labels and then threw the ordering away by building a set.
for split_frame in (train_data, test_data, validation_data):
    split_frame['themes'] = split_frame['themes'].apply(
        lambda labels: sorted({theme if theme in THEMES else 0 for theme in labels})
    )

# The binarizer is fitted on the training labels only.
y_train, y_test, mlb = transform_y(train_data.themes, test_data.themes)

X_train = train_data.body
X_test = test_data.body
print('X_train: {}, \n\ty_train: {}'.format(X_train.shape, y_train.shape))
print('X_test: {}, \n\ty_test: {}'.format(X_test.shape, y_test.shape))
print('Classes: ', mlb.classes_)
print('We\'re classifying {} themes!'.format(y_train.shape[1]))

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from xgboost.sklearn import XGBClassifier
from sklearn.multiclass import OneVsRestClassifier

# TF-IDF settings shared by both pipelines (unigrams, sublinear tf scaling,
# terms must appear in at least 10% of the documents, at most 10000 features).
TFIDF_PARAMS = dict(ngram_range=(1, 1), sublinear_tf=True,
                    min_df=0.1, max_features=10000)

# Pipeline documents `steps` as a *list* of (name, estimator) tuples, so the
# steps are passed as lists rather than as a tuple of tuples.

# One-vs-rest multinomial Naive Bayes on TF-IDF features.
pipe_nb = Pipeline([
    ("vectorizer", TfidfVectorizer(**TFIDF_PARAMS)),
    ("clf", OneVsRestClassifier(MultinomialNB(alpha=0.001, fit_prior=True), n_jobs=-1)),
])

# One-vs-rest linear SVM with balanced class weights on TF-IDF features.
pipe_svc = Pipeline([
    ("vectorizer", TfidfVectorizer(**TFIDF_PARAMS)),
    ("clf", OneVsRestClassifier(LinearSVC(verbose=2, class_weight="balanced"), n_jobs=-1)),
])

# parameters_vectorizer = {
#     "vectorizer__ngram_range": [(1, 2)],
#     "vectorizer__sublinear_tf": [True],
#     "vectorizer__min_df": [1, 2, 3],
#     "vectorizer__max_df": [0.5, 0.8, 1.]
# }

# parametersSVC = {
#         "clf__penalty": ["l2"],
#         "clf__C": [0.03, 1, 3, 10],
#         "clf__class_weight": ["balanced"]
# }

# parametersSVC.update(parameters_vectorizer)

# parametersNB = {
#     "clf__alpha": [0.0001, 0.0003, 0.001],
#     "clf__fit_prior": [True]
# }

# parametersNB.update(parameters_vectorizer)

# parametersBoost = {
#     "clf__max_depth": [3, 4, 5],
#     "clf__learning_rate": [0.03, 0.1, 0.3],
#     "clf__n_estimators": [100, 300, 1000, 3000]
# }

# parametersBoost.update(parameters_vectorizer)

In [None]:
# from sklearn.model_selection import GridSearchCV, train_test_split

# experiment, _, experiment_labels, _ = train_test_split(x_valid, y_valid, test_size=0.98, random_state=42,
#                                                        stratify=y_valid)

In [None]:
# clf = GridSearchCV(estimator=pipe_svc, param_grid=parametersSVC, verbose=10, n_jobs=15, scoring="f1_macro")
# clf.fit(experiment, experiment_labels)

In [None]:
# print(clf.best_params_)
# clf.best_score_
# pipe_svc = clf.best_estimator_

# Fit the TF-IDF + one-vs-rest LinearSVC pipeline on the training split.
pipe_svc.fit(X_train, y_train)

In [None]:
# Report labels: the classes learned by the MultiLabelBinarizer, as strings.
target_names=[str(x) for x in mlb.classes_]

In [None]:
# Evaluate the SVC pipeline on the test split.
preds_test = pipe_svc.predict(X_test)
# Per-class precision / recall / F1 with four decimal places.
print(classification_report(y_test, preds_test, target_names=target_names, digits=4))
# y_test is a multilabel indicator matrix, so this is subset accuracy:
# a sample only counts as correct when every label matches.
print(accuracy_score(y_test, preds_test))

In [None]:
# clf = GridSearchCV(estimator=pipe_nb, param_grid=parametersNB, verbose=10, n_jobs=15, scoring="f1_macro")
# clf.fit(experiment, experiment_labels)

In [None]:
# print(clf.best_params_)
# clf.best_score_
# pipe_nb = clf.best_estimator_

# Fit the TF-IDF + one-vs-rest MultinomialNB pipeline on the training split.
# (The previous line was a garbled merge of this call with a joblib.dump call
# and did not parse; the model is persisted in a later cell.)
pipe_nb.fit(X_train, y_train)

In [None]:
# Evaluate the Naive Bayes pipeline on the test split.
preds_test = pipe_nb.predict(X_test)
# Per-class precision / recall / F1 with four decimal places.
print(classification_report(y_test, preds_test, target_names=target_names, digits=4))
# y_test is a multilabel indicator matrix, so this is subset accuracy:
# a sample only counts as correct when every label matches.
print(accuracy_score(y_test, preds_test))

In [None]:
# Persist both fitted pipelines. `joblib` is imported in the notebook's import
# cell; the vendored `sklearn.externals.joblib` no longer exists in current
# scikit-learn releases, so it must not be imported from there.
# Make sure the target directory exists so a fresh checkout can run this cell.
os.makedirs('./models', exist_ok=True)

joblib.dump(pipe_nb, './models/nb_clf_themes_medium.pkl')
joblib.dump(pipe_svc, './models/svc_clf_themes_medium.pkl')

In [None]:
# Re-point the split paths at the "small" CSV files; the following cells
# reload the data from these paths and refit the same two pipelines.
TRAIN_DATA_PATH = '../train_parts_19-03-2019_small.csv'
TEST_DATA_PATH = '../test_parts_19-03-2019_small.csv'
VALIDATION_DATA_PATH = '../validation_parts_19-03-2019_small.csv'

In [None]:
# Reload the three splits from the currently configured *_DATA_PATH values.
train_data = get_data(TRAIN_DATA_PATH)
test_data = get_data(TEST_DATA_PATH)
validation_data = get_data(VALIDATION_DATA_PATH)

# Collapse every theme that is not in THEMES into the catch-all label 0.
# Deduplicate first and sort afterwards: the previous list(set(sorted(...)))
# sorted the labels and then threw the ordering away by building a set.
for split_frame in (train_data, test_data, validation_data):
    split_frame['themes'] = split_frame['themes'].apply(
        lambda labels: sorted({theme if theme in THEMES else 0 for theme in labels})
    )

# A fresh binarizer is fitted on the reloaded training labels only.
y_train, y_test, mlb = transform_y(train_data.themes, test_data.themes)

X_train = train_data.body
X_test = test_data.body
print('X_train: {}, \n\ty_train: {}'.format(X_train.shape, y_train.shape))
print('X_test: {}, \n\ty_test: {}'.format(X_test.shape, y_test.shape))
print('Classes: ', mlb.classes_)
print('We\'re classifying {} themes!'.format(y_train.shape[1]))

In [None]:
# Refit the SVC pipeline on the reloaded training split (replaces the earlier fit).
pipe_svc.fit(X_train, y_train)

In [None]:
# Rebuild the report labels from the binarizer fitted on the reloaded data.
target_names=[str(x) for x in mlb.classes_]

In [None]:
# Evaluate the refitted SVC pipeline on the reloaded test split.
preds_test = pipe_svc.predict(X_test)
# Per-class precision / recall / F1 with four decimal places.
print(classification_report(y_test, preds_test, target_names=target_names, digits=4))
# y_test is a multilabel indicator matrix, so this is subset accuracy:
# a sample only counts as correct when every label matches.
print(accuracy_score(y_test, preds_test))

In [None]:
# Refit the Naive Bayes pipeline on the reloaded training split (replaces the earlier fit).
pipe_nb.fit(X_train, y_train)

In [None]:
# Evaluate the refitted Naive Bayes pipeline on the reloaded test split.
preds_test = pipe_nb.predict(X_test)
# Per-class precision / recall / F1 with four decimal places.
print(classification_report(y_test, preds_test, target_names=target_names, digits=4))
# y_test is a multilabel indicator matrix, so this is subset accuracy:
# a sample only counts as correct when every label matches.
print(accuracy_score(y_test, preds_test))

In [None]:
# Persist the refitted pipelines under the "small" file names so the
# "medium" model files written earlier are not overwritten.
joblib.dump(pipe_nb, './models/nb_clf_themes_small.pkl')
joblib.dump(pipe_svc, './models/svc_clf_themes_small.pkl')

In [None]:
# Reload the persisted "small" SVC pipeline for inspection. `joblib` is
# imported in the notebook's import cell; `sklearn.externals.joblib` no longer
# exists in current scikit-learn releases.
# NOTE(security): joblib.load unpickles arbitrary Python objects - only load
# model files that were produced by this notebook or another trusted source.
model = joblib.load("./models/svc_clf_themes_small.pkl")

In [None]:
def print_words_for_tag(classifier, tag, tags_classes, index_to_words, top_n=10):
    """Print the words with the largest and smallest coefficients for a tag.

    Parameters
    ----------
    classifier : object
        Fitted one-vs-rest style classifier. Either exposes ``coef_`` with one
        row of feature coefficients per tag, or ``estimators_`` with one
        fitted estimator per tag, each of which exposes ``coef_``.
    tag : object
        Tag whose coefficients are inspected; must be present in
        ``tags_classes``.
    tags_classes : list
        Tags in the same order as the classifier's outputs.
    index_to_words : mapping
        Maps a feature index to the corresponding word.
    top_n : int, optional
        Number of words shown on each side (default 10).

    Raises
    ------
    ValueError
        If ``tag`` is not contained in ``tags_classes``.
    """
    # Real tab / newline escapes: the previous doubled backslashes printed the
    # two characters "\t" literally instead of whitespace.
    print('Tag:\t{}'.format(tag))

    tag_idx = tags_classes.index(tag)
    if hasattr(classifier, 'coef_'):
        coefs = classifier.coef_[tag_idx]
    else:
        # Recent scikit-learn releases no longer expose ``coef_`` on
        # OneVsRestClassifier; read it from the per-tag estimator instead and
        # flatten its (1, n_features) coefficient matrix.
        coefs = np.ravel(classifier.estimators_[tag_idx].coef_)

    # Ascending by coefficient; ties are broken by feature index.
    ranked_words = [(index_to_words[idx], coef)
                    for coef, idx in sorted(zip(coefs, range(len(coefs))))]
    # Take exactly ``top_n`` from each end (the old ``[:-10:-1]`` slice
    # returned only nine positive words).
    top_positive_words = ranked_words[::-1][:top_n]
    top_negative_words = ranked_words[:top_n]

    print('Top positive words:\t{}'.format(top_positive_words))
    print('Top negative words:\t{}\n'.format(top_negative_words))

In [None]:
# Split the loaded pipeline into its two steps: the first is the vectorizer,
# the second the classifier.
tfidf = model.steps[0][1]
clf = model.steps[1][1]
# Invert the vocabulary (word -> column index) so coefficients can be mapped
# back to words, then show the most influential words for theme 729.
print_words_for_tag(
    clf,
    729,
    mlb.classes_.tolist(),
    dict((column, word) for word, column in tfidf.vocabulary_.items()),
)