In [2]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
from tqdm import tqdm

# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed in
# 0.23. Prefer the standalone package and fall back to the vendored copy only
# on old installs that do not ship `joblib` separately.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Register the `progress_apply` / `progress_map` helpers on pandas objects.
tqdm.pandas()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
# Read the three dataset splits, keeping only the label column
# ("document_type") and the text column ("body").
train = pd.read_csv(
    "../train_medium.csv",
    usecols=["document_type", "body"],
)
valid = pd.read_csv(
    "../validation_medium.csv",
    usecols=["document_type", "body"],
)
test = pd.read_csv(
    "../test_medium.csv",
    usecols=["document_type", "body"],
)

In [4]:
def strip_trash(df, column="body"):
    """Return a copy of ``df`` with wrapper characters trimmed from ``column``.

    Every leading and trailing ``{``, ``}`` and ``"`` character is removed from
    each string in ``column``; characters inside the text are left alone.
    Missing values stay missing.

    The input frame is not modified, so re-running the calling cell always
    starts from the same raw data.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text column to clean.
    column : str, default "body"
        Label of the string column to clean.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns and index as ``df``.

    Raises
    ------
    KeyError
        If ``column`` is not present in ``df``.
    """
    # Work on a copy: assigning into `df` directly would silently change the
    # caller's frame as well.
    cleaned = df.copy()
    cleaned[column] = cleaned[column].str.strip('{}"')
    return cleaned

In [5]:
# Apply the same clean-up routine to every split.
train_clean = strip_trash(train)
valid_clean = strip_trash(valid)
test_clean = strip_trash(test)

# Show the number of training rows as the cell output.
len(train)

1466276

In [6]:
# Unbind the raw-frame names; only the *_clean variables are used from here on.
del train, valid, test

In [7]:
# Training inputs (text) and targets (document type), extracted with `.values`.
x_train = train_clean["body"].values
y_train = train_clean["document_type"].values

In [8]:
# Validation inputs (text) and targets (document type), extracted with `.values`.
x_valid = valid_clean["body"].values
y_valid = valid_clean["document_type"].values

In [9]:
# Test inputs (text) and targets (document type), extracted with `.values`.
x_test = test_clean["body"].values
y_test = test_clean["document_type"].values

In [10]:
# Sanity check: within each split the text and label arrays should have the
# same length. The tuple is displayed as the cell output.
tuple(
    len(part)
    for part in (x_train, y_train, x_valid, y_valid, x_test, y_test)
)

(1466276, 1466276, 309608, 309608, 311015, 311015)

In [11]:
# The x_* / y_* arrays extracted earlier are all that later cells use, so the
# cleaned frames can be unbound.
del train_clean, valid_clean, test_clean

In [12]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from xgboost.sklearn import XGBClassifier

# Naive Bayes baseline: TF-IDF over unigrams and bigrams with sublinear TF
# scaling, ignoring terms that occur in fewer than 2 documents or in more than
# half of them, followed by multinomial NB with a small smoothing constant.
# Steps are passed as a list, which is the type the Pipeline API documents.
pipe_nb = Pipeline([
    ("vectorizer", TfidfVectorizer(ngram_range=(1, 2), sublinear_tf=True,
                                   min_df=2, max_df=0.5)),
    ("clf", MultinomialNB(alpha=0.001, fit_prior=True)),
])

# Linear SVM: TF-IDF over unigrams and bigrams with sublinear TF scaling,
# ignoring terms that occur in fewer than 3 documents or in more than half of
# them, followed by LinearSVC with class weights balanced by class frequency.
# Steps are passed as a list, which is the type the Pipeline API documents.
# `random_state` is fixed so repeated fits on the same data give the same model;
# the solver otherwise shuffles the data with an unseeded generator.
pipe_svc = Pipeline([
    ("vectorizer", TfidfVectorizer(ngram_range=(1, 2), sublinear_tf=True,
                                   min_df=3, max_df=0.5)),
    ("clf", LinearSVC(verbose=2, class_weight="balanced", random_state=42)),
])

# parameters_vectorizer = {
#     "vectorizer__ngram_range": [(1, 2)],
#     "vectorizer__sublinear_tf": [True],
#     "vectorizer__min_df": [1, 2, 3],
#     "vectorizer__max_df": [0.5, 0.8, 1.]
# }

# parametersSVC = {
#         "clf__penalty": ["l2"],
#         "clf__C": [0.03, 1, 3, 10],
#         "clf__class_weight": ["balanced"]
# }

# parametersSVC.update(parameters_vectorizer)

# parametersNB = {
#     "clf__alpha": [0.0001, 0.0003, 0.001],
#     "clf__fit_prior": [True]
# }

# parametersNB.update(parameters_vectorizer)

# parametersBoost = {
#     "clf__max_depth": [3, 4, 5],
#     "clf__learning_rate": [0.03, 0.1, 0.3],
#     "clf__n_estimators": [100, 300, 1000, 3000]
# }

# parametersBoost.update(parameters_vectorizer)

In [13]:
# from sklearn.model_selection import GridSearchCV, train_test_split

# experiment, _, experiment_labels, _ = train_test_split(x_valid, y_valid, test_size=0.98, random_state=42,
#                                                        stratify=y_valid)

In [14]:
# clf = GridSearchCV(estimator=pipe_svc, param_grid=parametersSVC, verbose=10, n_jobs=15, scoring="f1_macro")
# clf.fit(experiment, experiment_labels)

In [None]:
# print(clf.best_params_)
# clf.best_score_
# pipe_svc = clf.best_estimator_

# Fit the TF-IDF + LinearSVC pipeline on the training split.
pipe_svc.fit(x_train, y_train)

In [None]:
from sklearn.metrics import classification_report, accuracy_score

# Score the linear SVM pipeline on the validation split.
preds_valid = pipe_svc.predict(x_valid)

# Pin `labels` to the fitted class order so every report row is guaranteed to
# carry the matching name, even when a class is absent from this split.
print(classification_report(y_valid, preds_valid, labels=pipe_svc.classes_,
                            target_names=pipe_svc.classes_, digits=4))
print(accuracy_score(y_valid, preds_valid))

In [None]:
# Score the linear SVM pipeline on the held-out test split.
preds_test = pipe_svc.predict(x_test)

# Pin `labels` to the fitted class order so every report row is guaranteed to
# carry the matching name, even when a class is absent from this split.
print(classification_report(y_test, preds_test, labels=pipe_svc.classes_,
                            target_names=pipe_svc.classes_, digits=4))
print(accuracy_score(y_test, preds_test))

In [None]:
# clf = GridSearchCV(estimator=pipe_nb, param_grid=parametersNB, verbose=10, n_jobs=15, scoring="f1_macro")
# clf.fit(experiment, experiment_labels)

In [None]:
# print(clf.best_params_)
# clf.best_score_
# pipe_nb = clf.best_estimator_

# Fit the TF-IDF + MultinomialNB pipeline on the training split.
pipe_nb.fit(x_train, y_train)

In [None]:
# Score the Naive Bayes pipeline on the validation split.
preds_valid = pipe_nb.predict(x_valid)

# Pin `labels` to the fitted class order so every report row is guaranteed to
# carry the matching name, even when a class is absent from this split.
print(classification_report(y_valid, preds_valid, labels=pipe_nb.classes_,
                            target_names=pipe_nb.classes_, digits=4))
print(accuracy_score(y_valid, preds_valid))

In [None]:
# Score the Naive Bayes pipeline on the held-out test split.
preds_test = pipe_nb.predict(x_test)

# Pin `labels` to the fitted class order so every report row is guaranteed to
# carry the matching name, even when a class is absent from this split.
print(classification_report(y_test, preds_test, labels=pipe_nb.classes_,
                            target_names=pipe_nb.classes_, digits=4))
print(accuracy_score(y_test, preds_test))

In [None]:
import os

# Create the target directory up front so a long training run is not lost to a
# FileNotFoundError at save time.
os.makedirs('./models', exist_ok=True)

# Persist both fitted pipelines (vectorizer + classifier) for later reuse.
# NOTE(review): the file names say "small" while the data files are "*_medium"
# — confirm the intended naming.
joblib.dump(pipe_nb, './models/nb_clf_small.pkl')
joblib.dump(pipe_svc, './models/svc_clf_small.pkl')