from __future__ import print_function
import pandas as pd
# from bs4 import BeautifulSoup
# import spacy
import string
import re
import glob
import os
import sys
import csv
import pickle
import numpy as np
np.random.seed(1337)  # for reproducibility
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import hamming_loss, f1_score, confusion_matrix, precision_score, recall_score
from keras.models import load_model
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, Dropout
from keras.layers import Convolution1D, MaxPooling1D, Embedding, LSTM, Activation
# from keras.layers.advanced_activations import PReLU
from keras.models import Sequential, Model
# from keras.callbacks import Callback
from joblib import Parallel, delayed

'''
class Metrics(Callback):
    def on_epoch_end(self, epoch, logs={}):
        val_predict = (np.asarray(self.model.predict(self.validation_data[0]))).round()
        val_targ = self.validation_data[1]
        _val_f1 = f1_score(val_targ, val_predict)
        _val_recall = recall_score(val_targ, val_predict)
        _val_precision = precision_score(val_targ, val_predict)
        self.val_f1s.append(_val_f1)
        self.val_precisions.append(_val_precision)
        self.val_recalls.append(_val_recall)
        print("-val_f1: %f, -val_precision: %f, -val_recall: %f" % (_val_f1, _val_precision, _val_recall))
        return
'''

if __name__ == '__main__':
    MAX_NB_WORDS = 100000
    MAX_SEQUENCE_LENGTH = 100
    VALIDATION_SPLIT = 0.10
    EMBEDDING_DIM = 100

    dataframe = pd.read_csv("data/Train.csv")  # in /data folder
    dataframe = dataframe.dropna(thresh=5)
    dataframe_test = pd.read_csv("data/Test.csv")  # in /data folder
    dataframe_test = dataframe_test.dropna(thresh=3)
    dataframe["processed"] = dataframe["title"] + dataframe["content"]
    dataframe_test["processed"] = dataframe_test["title"] + dataframe_test["content"]
    texts = list(dataframe["processed"])
    texts_test = list(dataframe_test["processed"])
    ids_test = dataframe_test["id"].tolist()

    print("Preparing train & test sets.")
    tokenizer = Tokenizer(nb_words=MAX_NB_WORDS)
    tokenizer.fit_on_texts(texts)
    word_index = tokenizer.word_index
    sequences = tokenizer.texts_to_sequences(texts)
    data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

    # Binarize the tag column into a multi-hot label matrix
    # (assuming tags are stored as space-separated strings; adjust if they are already lists)
    labeller = preprocessing.MultiLabelBinarizer()
    labels = labeller.fit_transform(dataframe["tags"].str.split())

    nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])
    x_train = data[:-nb_validation_samples]
    y_train = labels[:-nb_validation_samples]
    x_val = data[-nb_validation_samples:]
    y_val = labels[-nb_validation_samples:]
    print(len(x_train), len(y_train), len(x_val), len(y_val))
    x_test = pad_sequences(tokenizer.texts_to_sequences(texts_test), maxlen=MAX_SEQUENCE_LENGTH)

    print('Indexing word vectors for glove file.')
    embeddings_index = {}  # word -> vector
    f = open('glove.6B.100d.txt')  # to be downloaded
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
    f.close()
    print('Total word vectors: %s' % len(embeddings_index))

    print('Embedding matrix formation')
    nb_words = min(MAX_NB_WORDS, len(word_index))
    embedding_matrix = np.zeros((nb_words + 1, EMBEDDING_DIM))
    for word, i in word_index.items():
        if i > MAX_NB_WORDS:
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
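    # Words absent from GloVe keep their all-zero rows; the Embedding layer
    # below is initialised with this matrix and frozen (trainable=False), so
    # the pre-trained vectors are used as-is rather than updated during training.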
    embedding_layer = Embedding(nb_words + 1, EMBEDDING_DIM, weights=[embedding_matrix],
                                input_length=MAX_SEQUENCE_LENGTH, trainable=False)
    print('Embedding Matrix prepared')

    print('Training started')
    # metrics = Metrics()  # enable together with the Metrics callback above
    model = Sequential()
    model.add(embedding_layer)
    model.add(Convolution1D(nb_filter=512,
                            filter_length=7,
                            border_mode='valid',
                            activation='relu',
                            subsample_length=1))
    model.add(MaxPooling1D(pool_length=6))
    model.add(Dropout(0.3))
    model.add(Convolution1D(nb_filter=256,
                            filter_length=6,
                            border_mode='valid',
                            activation='relu',
                            subsample_length=1))
    model.add(MaxPooling1D(pool_length=5))
    model.add(Dropout(0.3))
    model.add(LSTM(100))
    model.add(Dense(300, activation='relu'))
    model.add(Dropout(0.3))
    model.add(Dense(len(labeller.classes_), activation='relu'))
    model.compile(loss='mean_squared_error', optimizer='adam', metrics=['acc', 'mae'])
    model.fit(x_train, y_train, validation_data=(x_val, y_val),
              nb_epoch=30, batch_size=128, verbose=1)  # , callbacks=[metrics])
    model.save('model_keras')
    # model = load_model('trained_model')  # optionally reload a previously saved model instead
    print('Training finished')

    # Search a per-tag decision threshold that maximises the Matthews
    # correlation coefficient on the validation predictions
    threshold = np.arange(0, 0.02, 0.00025)
    out = model.predict(x_val)
    out = np.array(out)

    def bestThreshold(y_prob, threshold, i):
        acc = []
        for j in threshold:
            y_pred = np.greater_equal(y_prob, j) * 1
            acc.append(matthews_corrcoef(y_val[:, i], y_pred))
        acc = np.array(acc)
        index = np.where(acc == acc.max())
        return threshold[index[0][0]]

    best_threshold = Parallel(n_jobs=4, verbose=1)(
        delayed(bestThreshold)(out[:, i], threshold, i) for i in range(out.shape[1]))
    y_pred = np.greater_equal(out, np.array(best_threshold)).astype(np.int8)

    hamming_new = hamming_loss(y_val, y_pred)
    try:
        hamming_old = pickle.load(open('best_hamming_loss', "rb"))
    except IOError:
        hamming_old = 1
    print('Model error (lower is better)')
    print('New Error: ' + str(hamming_new))
    print('Old Error: ' + str(hamming_old))
    pickle.dump(best_threshold, open('best_threshold', "wb"))
    best_threshold = pickle.load(open('best_threshold', "rb"))
    if hamming_new < hamming_old:
        pickle.dump(hamming_new, open('best_hamming_loss', "wb"))
    model.save('trained_model')

    print('Start prediction')
    test_probs = model.predict(x_test, batch_size=256, verbose=1)
    print(x_test, test_probs)
    print('Getting result vector using best_threshold')
    y_pred = np.greater_equal(test_probs, np.array(best_threshold))

    def getTagFromVector(y):
        return ' '.join(y)

    y_tags_temp = labeller.inverse_transform(y_pred)
    y_tags = Parallel(n_jobs=4, verbose=1)(delayed(getTagFromVector)(y) for y in y_tags_temp)
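The thresholding step above is the piece most worth understanding in isolation: the network emits one continuous score per tag, and each tag gets its own cut-off, chosen to maximise the Matthews correlation coefficient on held-out data. Below is a minimal, self-contained sketch of that idea; the arrays, tag names, and the `candidates` grid are made up for illustration and are not part of the script above.

import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import matthews_corrcoef

# Toy ground truth: three samples, each with a set of tags (illustrative only)
mlb = MultiLabelBinarizer()
y_true = mlb.fit_transform([{'python', 'keras'}, {'python'}, {'nlp'}])
print(mlb.classes_)  # ['keras' 'nlp' 'python']

# Fake per-tag scores, as a model might produce them
y_prob = np.array([[0.9, 0.1, 0.8],
                   [0.2, 0.3, 0.7],
                   [0.1, 0.6, 0.2]])

# For each column (tag), pick the candidate threshold with the highest MCC
candidates = np.arange(0.0, 1.0, 0.05)
best = []
for i in range(y_prob.shape[1]):
    scores = [matthews_corrcoef(y_true[:, i], (y_prob[:, i] >= t).astype(int))
              for t in candidates]
    best.append(candidates[int(np.argmax(scores))])

# Apply the per-tag thresholds and map the binary rows back to tag sets
y_pred = (y_prob >= np.array(best)).astype(int)
print(mlb.inverse_transform(y_pred))

Per-tag thresholds matter here because tag frequencies are usually very imbalanced: a single global cut-off tuned for a common tag tends to suppress rare tags entirely, whereas tuning each column separately lets rare tags fire at lower scores.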