Preprocesamiento y Creación de Modelos de clasificación

Preprocesamiento y Creación de Modelos de clasificación#

import warnings
warnings.filterwarnings('ignore')

Preprocesamiento#

import pandas as pd
import numpy as np
from tqdm import tqdm
from keras.preprocessing.text import Tokenizer
tqdm.pandas(desc="progress-bar")
from gensim.models import Doc2Vec
from sklearn import utils
from sklearn.model_selection import train_test_split
from keras_preprocessing.sequence import pad_sequences
from gensim.models.doc2vec import TaggedDocument
import re
import nltk
from nltk.corpus import stopwords
from bs4 import BeautifulSoup

df = pd.read_csv('https://raw.githubusercontent.com/lihkir/Data/main/spam_text_class.csv',delimiter=',',encoding='latin-1')

df = df[['Category','Message']]

df = df[pd.notnull(df['Message'])]

# quitar etiquetas html

def cleanText(text):
    text = BeautifulSoup(text, "lxml").text
    text = re.sub(r'\|\|\|', r' ', text) 
    text = re.sub(r'http\S+', r'<URL>', text)
    text = text.lower()
    text = text.replace('x', '')
    return text

df['Message'] = df['Message'].apply(cleanText)

train_test split

max_fatures = 500000

MAX_SEQUENCE_LENGTH = 50

tokenizer = Tokenizer(num_words=max_fatures, split=' ', filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True)
tokenizer.fit_on_texts(df['Message'].values)

X = tokenizer.texts_to_sequences(df['Message'].values)
X = pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)

Y = pd.get_dummies(df['Category']).values

X_train, X_test, y_train, y_test = train_test_split(X,Y, test_size = 0.15, random_state = 42)

SMOTE#

from imblearn.over_sampling import SMOTE

# Crear una instancia de SMOTE
smote = SMOTE(random_state=42)

y_for_smote = np.argmax(y_train, axis=1)

# Aplicar SMOTE al conjunto de entrenamiento
X_train_smote, y_for_smote = smote.fit_resample(X_train, y_train)

# Get unique values in the array
unique_values = np.unique(y_for_smote)

# Initialize one-hot encoded array with zeros
y_train_smote = np.zeros((len(y_for_smote), len(unique_values)))

# Fill in the one-hot array based on the original binary array
for i, value in enumerate(y_for_smote):

    y_train_smote[i, np.where(unique_values == value)] = 1

import matplotlib.pyplot as plt

plt.figure(figsize=(12, 6))

# Chart 1
plt.subplot(1, 2, 1)
pd.DataFrame(np.argmax(y_train, axis=1).astype(str)).value_counts().plot(kind='bar', title='Correos por clase Antes del SMOTE')

# Chart 2
plt.subplot(1, 2, 2)
pd.DataFrame(np.argmax(y_train_smote, axis=1).astype(str)).value_counts().plot(kind='bar', title='Correos por clase después del SMOTE')

plt.tight_layout()  # To ensure proper spacing between subplots
plt.show()

_images/1e8a129d4de7a8c0fe4f3c45ce1a59c6fd28a462cb195e69fd82c47da325c075.png

Creación y Comparación de Modelos#

LSTM#

train, test = train_test_split(df, test_size=0.000001 , random_state=42)

def tokenize_text(text):
    tokens = []
    for sent in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sent):
            if len(word) <= 0:
                continue
            tokens.append(word.lower())
    return tokens

train_tagged = train.apply(lambda r: TaggedDocument(words=tokenize_text(r['Message']), tags=[r.Category]), axis=1)

test_tagged = test.apply(lambda r: TaggedDocument(words=tokenize_text(r['Message']), tags=[r.Category]), axis=1)

d2v_model = Doc2Vec(dm=1, dm_mean=1, vector_size=20, window=8, min_count=1, workers=1, alpha=0.065, min_alpha=0.065)

d2v_model.build_vocab([x for x in tqdm(train_tagged.values)])

for epoch in range(30):
    d2v_model.train(utils.shuffle([x for x in tqdm(train_tagged.values)]), total_examples=len(train_tagged.values), epochs=1)
    d2v_model.alpha -= 0.002
    d2v_model.min_alpha = d2v_model.alpha

100%|██████████| 5571/5571 [00:00<?, ?it/s]
100%|██████████| 5571/5571 [00:00<00:00, 929010.32it/s]
100%|██████████| 5571/5571 [00:00<?, ?it/s]
100%|██████████| 5571/5571 [00:00<00:00, 506326.63it/s]
100%|██████████| 5571/5571 [00:00<00:00, 369547.17it/s]
100%|██████████| 5571/5571 [00:00<?, ?it/s]
100%|██████████| 5571/5571 [00:00<?, ?it/s]
100%|██████████| 5571/5571 [00:00<00:00, 356228.73it/s]
100%|██████████| 5571/5571 [00:00<?, ?it/s]
100%|██████████| 5571/5571 [00:00<?, ?it/s]
100%|██████████| 5571/5571 [00:00<00:00, 355973.67it/s]
100%|██████████| 5571/5571 [00:00<00:00, 356614.74it/s]
100%|██████████| 5571/5571 [00:00<00:00, 356505.92it/s]
100%|██████████| 5571/5571 [00:00<?, ?it/s]
100%|██████████| 5571/5571 [00:00<?, ?it/s]
100%|██████████| 5571/5571 [00:00<?, ?it/s]
100%|██████████| 5571/5571 [00:00<00:00, 308671.96it/s]
100%|██████████| 5571/5571 [00:00<?, ?it/s]
100%|██████████| 5571/5571 [00:00<?, ?it/s]
100%|██████████| 5571/5571 [00:00<?, ?it/s]
100%|██████████| 5571/5571 [00:00<00:00, 319397.30it/s]
100%|██████████| 5571/5571 [00:00<00:00, 294050.99it/s]
100%|██████████| 5571/5571 [00:00<00:00, 356473.29it/s]
100%|██████████| 5571/5571 [00:00<?, ?it/s]
100%|██████████| 5571/5571 [00:00<00:00, 1231045.13it/s]
100%|██████████| 5571/5571 [00:00<00:00, 356723.62it/s]
100%|██████████| 5571/5571 [00:00<00:00, 356098.44it/s]
100%|██████████| 5571/5571 [00:00<00:00, 336828.51it/s]
100%|██████████| 5571/5571 [00:00<?, ?it/s]
100%|██████████| 5571/5571 [00:00<00:00, 10733333.75it/s]
100%|██████████| 5571/5571 [00:00<?, ?it/s]

Numero de palabras totales que queda en vector de embeddings

len(d2v_model.wv.index_to_key)

top 10 parabras mas similares a ‘urgent’

d2v_model.wv.most_similar(positive=['urgent'], topn=10)

[('aint', 0.754719078540802),
 ('cherish', 0.753497302532196),
 ('pee', 0.7372303605079651),
 ('plate', 0.7127653956413269),
 ('09066612661', 0.7108734250068665),
 ('09061213237', 0.7107982635498047),
 ('wrld', 0.7033034563064575),
 ('pig', 0.7011724710464478),
 ('zogtorius', 0.6995012164115906),
 ('lower', 0.6984557509422302)]

from keras.models import Sequential
from keras.layers import LSTM, Dense, Embedding
from tensorflow.keras.optimizers.legacy import Adam

# matriz de 9363 x 20
embedding_matrix = np.zeros((len(d2v_model.wv.index_to_key)+ 1, 20))

model = Sequential()

model.add(Embedding(len(d2v_model.wv.index_to_key)+1,20,input_length=X.shape[1],weights=[embedding_matrix],trainable=True))

model.add(LSTM(50,return_sequences=False))

model.add(Dense(2,activation="softmax"))

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 embedding (Embedding)       (None, 50, 20)            187280    
                                                                 
 lstm (LSTM)                 (None, 50)                14200     
                                                                 
 dense (Dense)               (None, 2)                 102       
                                                                 
=================================================================
Total params: 201,582
Trainable params: 201,582
Non-trainable params: 0
_________________________________________________________________

model.compile(optimizer=Adam(),loss="binary_crossentropy",metrics = ['accuracy'])

batch_size = 32

history = model.fit(X_train, y_train, epochs =50, batch_size=batch_size, verbose = 2)

Epoch 1/50
148/148 - 10s - loss: 0.2749 - accuracy: 0.9174 - 10s/epoch - 65ms/step
Epoch 2/50
148/148 - 5s - loss: 0.0509 - accuracy: 0.9873 - 5s/epoch - 33ms/step
Epoch 3/50
148/148 - 5s - loss: 0.0228 - accuracy: 0.9949 - 5s/epoch - 33ms/step
Epoch 4/50
148/148 - 4s - loss: 0.0145 - accuracy: 0.9970 - 4s/epoch - 30ms/step
Epoch 5/50
148/148 - 3s - loss: 0.0063 - accuracy: 0.9987 - 3s/epoch - 21ms/step
Epoch 6/50
148/148 - 3s - loss: 0.0030 - accuracy: 0.9992 - 3s/epoch - 22ms/step
Epoch 7/50
148/148 - 3s - loss: 0.0018 - accuracy: 0.9994 - 3s/epoch - 21ms/step
Epoch 8/50
148/148 - 3s - loss: 0.0016 - accuracy: 0.9996 - 3s/epoch - 21ms/step
Epoch 9/50
148/148 - 3s - loss: 0.0010 - accuracy: 0.9996 - 3s/epoch - 21ms/step
Epoch 10/50
148/148 - 3s - loss: 7.1594e-04 - accuracy: 1.0000 - 3s/epoch - 21ms/step
Epoch 11/50
148/148 - 3s - loss: 5.2601e-04 - accuracy: 1.0000 - 3s/epoch - 21ms/step
Epoch 12/50
148/148 - 3s - loss: 0.0052 - accuracy: 0.9989 - 3s/epoch - 22ms/step
Epoch 13/50
148/148 - 3s - loss: 0.0039 - accuracy: 0.9994 - 3s/epoch - 21ms/step
Epoch 14/50
148/148 - 3s - loss: 4.8190e-04 - accuracy: 1.0000 - 3s/epoch - 22ms/step
Epoch 15/50
148/148 - 3s - loss: 3.1171e-04 - accuracy: 1.0000 - 3s/epoch - 22ms/step
Epoch 16/50
148/148 - 3s - loss: 2.2317e-04 - accuracy: 1.0000 - 3s/epoch - 22ms/step
Epoch 17/50
148/148 - 3s - loss: 1.8234e-04 - accuracy: 1.0000 - 3s/epoch - 22ms/step
Epoch 18/50
148/148 - 3s - loss: 1.4445e-04 - accuracy: 1.0000 - 3s/epoch - 22ms/step
Epoch 19/50
148/148 - 3s - loss: 1.2356e-04 - accuracy: 1.0000 - 3s/epoch - 22ms/step
Epoch 20/50
148/148 - 3s - loss: 9.9602e-05 - accuracy: 1.0000 - 3s/epoch - 22ms/step
Epoch 21/50
148/148 - 3s - loss: 8.4726e-05 - accuracy: 1.0000 - 3s/epoch - 22ms/step
Epoch 22/50
148/148 - 3s - loss: 7.3875e-05 - accuracy: 1.0000 - 3s/epoch - 22ms/step
Epoch 23/50
148/148 - 3s - loss: 6.2928e-05 - accuracy: 1.0000 - 3s/epoch - 23ms/step
Epoch 24/50
148/148 - 3s - loss: 5.8087e-05 - accuracy: 1.0000 - 3s/epoch - 23ms/step
Epoch 25/50
148/148 - 3s - loss: 4.8064e-05 - accuracy: 1.0000 - 3s/epoch - 21ms/step
Epoch 26/50
148/148 - 3s - loss: 4.3706e-05 - accuracy: 1.0000 - 3s/epoch - 22ms/step
Epoch 27/50
148/148 - 4s - loss: 4.3440e-05 - accuracy: 1.0000 - 4s/epoch - 24ms/step
Epoch 28/50
148/148 - 3s - loss: 3.4584e-05 - accuracy: 1.0000 - 3s/epoch - 22ms/step
Epoch 29/50
148/148 - 3s - loss: 3.0517e-05 - accuracy: 1.0000 - 3s/epoch - 20ms/step
Epoch 30/50
148/148 - 3s - loss: 2.7889e-05 - accuracy: 1.0000 - 3s/epoch - 21ms/step
Epoch 31/50
148/148 - 3s - loss: 2.3997e-05 - accuracy: 1.0000 - 3s/epoch - 20ms/step
Epoch 32/50
148/148 - 3s - loss: 2.1514e-05 - accuracy: 1.0000 - 3s/epoch - 22ms/step
Epoch 33/50
148/148 - 3s - loss: 1.9524e-05 - accuracy: 1.0000 - 3s/epoch - 22ms/step
Epoch 34/50
148/148 - 3s - loss: 1.7450e-05 - accuracy: 1.0000 - 3s/epoch - 23ms/step
Epoch 35/50
148/148 - 3s - loss: 1.6170e-05 - accuracy: 1.0000 - 3s/epoch - 23ms/step
Epoch 36/50
148/148 - 3s - loss: 1.4798e-05 - accuracy: 1.0000 - 3s/epoch - 22ms/step
Epoch 37/50
148/148 - 3s - loss: 1.3372e-05 - accuracy: 1.0000 - 3s/epoch - 22ms/step
Epoch 38/50
148/148 - 3s - loss: 1.2211e-05 - accuracy: 1.0000 - 3s/epoch - 23ms/step
Epoch 39/50
148/148 - 3s - loss: 1.1010e-05 - accuracy: 1.0000 - 3s/epoch - 23ms/step
Epoch 40/50
148/148 - 3s - loss: 1.0638e-05 - accuracy: 1.0000 - 3s/epoch - 23ms/step
Epoch 41/50
148/148 - 3s - loss: 9.4834e-06 - accuracy: 1.0000 - 3s/epoch - 22ms/step
Epoch 42/50
148/148 - 3s - loss: 8.5392e-06 - accuracy: 1.0000 - 3s/epoch - 22ms/step
Epoch 43/50
148/148 - 4s - loss: 7.7573e-06 - accuracy: 1.0000 - 4s/epoch - 24ms/step
Epoch 44/50
148/148 - 4s - loss: 7.1510e-06 - accuracy: 1.0000 - 4s/epoch - 24ms/step
Epoch 45/50
148/148 - 3s - loss: 6.4189e-06 - accuracy: 1.0000 - 3s/epoch - 23ms/step
Epoch 46/50
148/148 - 3s - loss: 5.9673e-06 - accuracy: 1.0000 - 3s/epoch - 23ms/step
Epoch 47/50
148/148 - 3s - loss: 5.4678e-06 - accuracy: 1.0000 - 3s/epoch - 23ms/step
Epoch 48/50
148/148 - 3s - loss: 5.0584e-06 - accuracy: 1.0000 - 3s/epoch - 24ms/step
Epoch 49/50
148/148 - 3s - loss: 4.6666e-06 - accuracy: 1.0000 - 3s/epoch - 23ms/step
Epoch 50/50
148/148 - 3s - loss: 4.4344e-06 - accuracy: 1.0000 - 3s/epoch - 21ms/step

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc

y_pred = model.predict(X_test)
y_pred = (y_pred[:, 1] > 0.5).astype(int)

y_test = y_test[:, 1].astype(int)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

# Calculate precision
precision = precision_score(y_test, y_pred)
print(f'Precision: {precision:.2f}')

# Calculate recall
recall = recall_score(y_test, y_pred)
print(f'Recall: {recall:.2f}')

# Calculate F1 Score
f1score = f1_score(y_test, y_pred)
print(f'F1 Score: {f1score:.2f}')

# Calculate ROC curve and AUC
fpr, tpr, thresholds = roc_curve(y_test, y_pred)
roc_auc = auc(fpr, tpr)

# Print AUC
print(f'AUC: {roc_auc:.2f}')

# Plot ROC curve
plt.figure(figsize=(10, 6))
plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = {:.2f})'.format(roc_auc))
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.show()

27/27 [==============================] - 1s 11ms/step
Accuracy: 0.99
Precision: 0.98
Recall: 0.93
F1 Score: 0.96
AUC: 0.97

_images/2d172f4038fb250df206a2e3ae8340a663baf385cee0c689f4fc17e8b03e1df4.png

LSTM con SMOTE#

# matriz de 9363 x 20
embedding_matrix = np.zeros((len(d2v_model.wv.index_to_key)+ 1, 20))

model_2 = Sequential()

model_2.add(Embedding(len(d2v_model.wv.index_to_key)+1,20,input_length=X.shape[1],weights=[embedding_matrix],trainable=True))

model_2.add(LSTM(50,return_sequences=False))

model_2.add(Dense(2,activation="softmax"))

model_2.compile(optimizer=Adam(),loss="binary_crossentropy",metrics = ['accuracy'])

batch_size = 32

history_2 = model_2.fit(X_train_smote, y_train_smote, epochs =50, batch_size=batch_size, verbose = 2)

Epoch 1/50
256/256 - 8s - loss: 0.3630 - accuracy: 0.8361 - 8s/epoch - 33ms/step
Epoch 2/50
256/256 - 6s - loss: 0.1362 - accuracy: 0.9559 - 6s/epoch - 23ms/step
Epoch 3/50
256/256 - 6s - loss: 0.0779 - accuracy: 0.9779 - 6s/epoch - 23ms/step
Epoch 4/50
256/256 - 6s - loss: 0.0452 - accuracy: 0.9878 - 6s/epoch - 23ms/step
Epoch 5/50
256/256 - 6s - loss: 0.0274 - accuracy: 0.9932 - 6s/epoch - 23ms/step
Epoch 6/50
256/256 - 6s - loss: 0.0188 - accuracy: 0.9951 - 6s/epoch - 22ms/step
Epoch 7/50
256/256 - 5s - loss: 0.0126 - accuracy: 0.9963 - 5s/epoch - 21ms/step
Epoch 8/50
256/256 - 5s - loss: 0.0063 - accuracy: 0.9982 - 5s/epoch - 21ms/step
Epoch 9/50
256/256 - 5s - loss: 0.0053 - accuracy: 0.9985 - 5s/epoch - 21ms/step
Epoch 10/50
256/256 - 5s - loss: 0.0033 - accuracy: 0.9990 - 5s/epoch - 21ms/step
Epoch 11/50
256/256 - 5s - loss: 0.0037 - accuracy: 0.9991 - 5s/epoch - 21ms/step
Epoch 12/50
256/256 - 5s - loss: 0.0043 - accuracy: 0.9989 - 5s/epoch - 21ms/step
Epoch 13/50
256/256 - 5s - loss: 0.0120 - accuracy: 0.9967 - 5s/epoch - 21ms/step
Epoch 14/50
256/256 - 5s - loss: 0.0035 - accuracy: 0.9988 - 5s/epoch - 21ms/step
Epoch 15/50
256/256 - 5s - loss: 0.0013 - accuracy: 0.9998 - 5s/epoch - 21ms/step
Epoch 16/50
256/256 - 5s - loss: 8.1742e-04 - accuracy: 0.9998 - 5s/epoch - 21ms/step
Epoch 17/50
256/256 - 5s - loss: 5.7329e-04 - accuracy: 0.9999 - 5s/epoch - 21ms/step
Epoch 18/50
256/256 - 5s - loss: 4.6134e-04 - accuracy: 0.9999 - 5s/epoch - 21ms/step
Epoch 19/50
256/256 - 5s - loss: 3.4523e-04 - accuracy: 1.0000 - 5s/epoch - 21ms/step
Epoch 20/50
256/256 - 5s - loss: 2.6883e-04 - accuracy: 0.9999 - 5s/epoch - 21ms/step
Epoch 21/50
256/256 - 5s - loss: 2.2418e-04 - accuracy: 1.0000 - 5s/epoch - 21ms/step
Epoch 22/50
256/256 - 5s - loss: 1.5771e-04 - accuracy: 1.0000 - 5s/epoch - 21ms/step
Epoch 23/50
256/256 - 5s - loss: 1.3357e-04 - accuracy: 1.0000 - 5s/epoch - 21ms/step
Epoch 24/50
256/256 - 5s - loss: 1.1690e-04 - accuracy: 1.0000 - 5s/epoch - 21ms/step
Epoch 25/50
256/256 - 5s - loss: 7.7140e-05 - accuracy: 1.0000 - 5s/epoch - 21ms/step
Epoch 26/50
256/256 - 5s - loss: 6.9837e-05 - accuracy: 1.0000 - 5s/epoch - 21ms/step
Epoch 27/50
256/256 - 5s - loss: 5.4699e-05 - accuracy: 1.0000 - 5s/epoch - 21ms/step
Epoch 28/50
256/256 - 5s - loss: 4.6409e-05 - accuracy: 1.0000 - 5s/epoch - 21ms/step
Epoch 29/50
256/256 - 5s - loss: 4.0961e-05 - accuracy: 1.0000 - 5s/epoch - 21ms/step
Epoch 30/50
256/256 - 6s - loss: 3.0426e-05 - accuracy: 1.0000 - 6s/epoch - 22ms/step
Epoch 31/50
256/256 - 6s - loss: 2.4787e-05 - accuracy: 1.0000 - 6s/epoch - 23ms/step
Epoch 32/50
256/256 - 6s - loss: 2.0901e-05 - accuracy: 1.0000 - 6s/epoch - 22ms/step
Epoch 33/50
256/256 - 6s - loss: 1.6103e-05 - accuracy: 1.0000 - 6s/epoch - 23ms/step
Epoch 34/50
256/256 - 6s - loss: 1.4357e-05 - accuracy: 1.0000 - 6s/epoch - 22ms/step
Epoch 35/50
256/256 - 6s - loss: 1.2146e-05 - accuracy: 1.0000 - 6s/epoch - 22ms/step
Epoch 36/50
256/256 - 5s - loss: 1.0225e-05 - accuracy: 1.0000 - 5s/epoch - 20ms/step
Epoch 37/50
256/256 - 5s - loss: 8.4397e-06 - accuracy: 1.0000 - 5s/epoch - 20ms/step
Epoch 38/50
256/256 - 5s - loss: 7.2955e-06 - accuracy: 1.0000 - 5s/epoch - 20ms/step
Epoch 39/50
256/256 - 5s - loss: 6.3138e-06 - accuracy: 1.0000 - 5s/epoch - 20ms/step
Epoch 40/50
256/256 - 5s - loss: 5.3885e-06 - accuracy: 1.0000 - 5s/epoch - 20ms/step
Epoch 41/50
256/256 - 5s - loss: 4.5613e-06 - accuracy: 1.0000 - 5s/epoch - 20ms/step
Epoch 42/50
256/256 - 5s - loss: 3.9344e-06 - accuracy: 1.0000 - 5s/epoch - 20ms/step
Epoch 43/50
256/256 - 5s - loss: 3.3964e-06 - accuracy: 1.0000 - 5s/epoch - 20ms/step
Epoch 44/50
256/256 - 5s - loss: 2.8711e-06 - accuracy: 1.0000 - 5s/epoch - 20ms/step
Epoch 45/50
256/256 - 5s - loss: 2.4272e-06 - accuracy: 1.0000 - 5s/epoch - 20ms/step
Epoch 46/50
256/256 - 5s - loss: 2.1240e-06 - accuracy: 1.0000 - 5s/epoch - 20ms/step
Epoch 47/50
256/256 - 5s - loss: 1.8127e-06 - accuracy: 1.0000 - 5s/epoch - 20ms/step
Epoch 48/50
256/256 - 5s - loss: 1.6042e-06 - accuracy: 1.0000 - 5s/epoch - 21ms/step
Epoch 49/50
256/256 - 5s - loss: 1.3917e-06 - accuracy: 1.0000 - 5s/epoch - 20ms/step
Epoch 50/50
256/256 - 5s - loss: 1.1890e-06 - accuracy: 1.0000 - 5s/epoch - 21ms/step

y_pred = model_2.predict(X_test)
y_pred = (y_pred[:, 1] > 0.5).astype(int)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

# Calculate precision
precision = precision_score(y_test, y_pred)
print(f'Precision: {precision:.2f}')

# Calculate recall
recall = recall_score(y_test, y_pred)
print(f'Recall: {recall:.2f}')

# Calculate F1 Score
f1score = f1_score(y_test, y_pred)
print(f'F1 Score: {f1score:.2f}')

# Calculate ROC curve and AUC
fpr, tpr, thresholds = roc_curve(y_test, y_pred)
roc_auc = auc(fpr, tpr)

# Print AUC
print(f'AUC: {roc_auc:.2f}')

# Plot ROC curve
plt.figure(figsize=(10, 6))
plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = {:.2f})'.format(roc_auc))
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.show()

27/27 [==============================] - 1s 11ms/step
Accuracy: 0.91
Precision: 0.58
Recall: 0.95
F1 Score: 0.72
AUC: 0.93

_images/02ecd5d70687db4c88374b100d06914ba11f1474a6197b4c0cc509b64418aaf0.png

KNN#

from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import make_scorer, f1_score

# Define the kNN classifier
knn_classifier = KNeighborsClassifier()

# Define the hyperparameters and their possible values
param_grid = {'n_neighbors': [3, 5, 7], 'weights': ['uniform', 'distance']}

# Create a F1 scorer for grid search
f1_scorer = make_scorer(f1_score)

# Create GridSearchCV object
grid_search = GridSearchCV(knn_classifier, param_grid, scoring=f1_scorer, cv=5)

# Fit the grid search to the data
grid_search.fit(X_train, y_train)

# Print the best hyperparameters
print("Best Hyperparameters:", grid_search.best_params_)

# Get the best model
best_knn_model = grid_search.best_estimator_

# Make predictions on the test set
y_pred = best_knn_model.predict(X_test)
y_pred = (y_pred[:, 1] > 0.5).astype(int)

# Calculate F1 Score on the test set
f1score = f1_score(y_test, y_pred)
print(f'F1 Score on Test Set: {f1score:.2f}')

Best Hyperparameters: {'n_neighbors': 3, 'weights': 'uniform'}
F1 Score on Test Set: 0.48

from sklearn.neighbors import KNeighborsClassifier

# Create a kNN classifier
knn_classifier = KNeighborsClassifier(n_neighbors=3, weights='uniform')

# Train the classifier on the training data
knn_classifier.fit(X_train, y_train)

# Make predictions on the testing data
y_pred = knn_classifier.predict(X_test)
y_pred = (y_pred[:, 1] > 0.5).astype(int)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

# Calculate precision
precision = precision_score(y_test, y_pred)
print(f'Precision: {precision:.2f}')

# Calculate recall
recall = recall_score(y_test, y_pred)
print(f'Recall: {recall:.2f}')

# Calculate F1 Score
f1score = f1_score(y_test, y_pred)
print(f'F1 Score: {f1score:.2f}')

# Calculate ROC curve and AUC
fpr, tpr, thresholds = roc_curve(y_test, y_pred)
roc_auc = auc(fpr, tpr)

# Print AUC
print(f'AUC: {roc_auc:.2f}')

# Plot ROC curve
plt.figure(figsize=(10, 6))
plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = {:.2f})'.format(roc_auc))
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.show()

Accuracy: 0.88
Precision: 0.55
Recall: 0.43
F1 Score: 0.48
AUC: 0.69

_images/68d6a3f308881be1670321d8417097343619e6627f279d7390e8b0131db0d808.png

KNN con SMOTE#

# Create a kNN classifier
knn_classifier = KNeighborsClassifier(n_neighbors=3, weights='uniform')

# Train the classifier on the training data
knn_classifier.fit(X_train_smote, y_train_smote)

# Make predictions on the testing data
y_pred = knn_classifier.predict(X_test)
y_pred = (y_pred[:, 1] > 0.5).astype(int)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

# Calculate precision
precision = precision_score(y_test, y_pred)
print(f'Precision: {precision:.2f}')

# Calculate recall
recall = recall_score(y_test, y_pred)
print(f'Recall: {recall:.2f}')

# Calculate F1 Score
f1score = f1_score(y_test, y_pred)
print(f'F1 Score: {f1score:.2f}')

# Calculate ROC curve and AUC
fpr, tpr, thresholds = roc_curve(y_test, y_pred)
roc_auc = auc(fpr, tpr)

# Print AUC
print(f'AUC: {roc_auc:.2f}')

# Plot ROC curve
plt.figure(figsize=(10, 6))
plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = {:.2f})'.format(roc_auc))
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.show()

Accuracy: 0.82
Precision: 0.39
Recall: 0.71
F1 Score: 0.50
AUC: 0.77

_images/dbeb71ceb786f03078e7a2505ff6cfcdf6d1da2ba42f6f8a661c883df572408b.png

Ridge Classifier#

from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import RidgeClassifier
from sklearn.metrics import make_scorer, f1_score

# Define the Ridge classifier
ridge_classifier = RidgeClassifier()

# Define the hyperparameters and their possible values
param_grid = {'alpha': [0.1, 1.0, 10.0], 'solver': ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga']}

# Create a F1 scorer for grid search
f1_scorer = make_scorer(f1_score)

# Create GridSearchCV object
grid_search = GridSearchCV(ridge_classifier, param_grid, scoring=f1_scorer, cv=5)

# Fit the grid search to the data
grid_search.fit(X_train, y_train)

# Print the best hyperparameters
print("Best Hyperparameters:", grid_search.best_params_)

# Get the best model
best_ridge_model = grid_search.best_estimator_

# Make predictions on the test set
y_pred = best_ridge_model.predict(X_test)
y_pred = (y_pred[:, 1] > 0.5).astype(int)

# Calculate F1 Score on the test set
f1score = f1_score(y_test, y_pred)
print(f'F1 Score on Test Set: {f1score:.2f}')

Best Hyperparameters: {'alpha': 0.1, 'solver': 'auto'}
F1 Score on Test Set: 0.19

ridge_classifier = RidgeClassifier(alpha = 0.1, solver = 'auto')

# Train the classifier on the training data
ridge_classifier.fit(X_train, y_train)

# Make predictions on the testing data
y_pred = ridge_classifier.predict(X_test)
y_pred = (y_pred[:, 1] > 0.5).astype(int)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

# Calculate precision
precision = precision_score(y_test, y_pred)
print(f'Precision: {precision:.2f}')

# Calculate recall
recall = recall_score(y_test, y_pred)
print(f'Recall: {recall:.2f}')

# Calculate F1 Score
f1score = f1_score(y_test, y_pred)
print(f'F1 Score: {f1score:.2f}')

# Calculate ROC curve and AUC
fpr, tpr, thresholds = roc_curve(y_test, y_pred)
roc_auc = auc(fpr, tpr)

# Print AUC
print(f'AUC: {roc_auc:.2f}')

# Plot ROC curve
plt.figure(figsize=(10, 6))
plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = {:.2f})'.format(roc_auc))
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.show()

Accuracy: 0.87
Precision: 0.42
Recall: 0.12
F1 Score: 0.19
AUC: 0.55

_images/5279f5eb88be4c0640803c827c638ab446b4d523f0d33613af3dc3534a260903.png

Ridge Classifier con SMOTE#

ridge_classifier = RidgeClassifier(alpha = 0.1, solver = 'auto')

# Train the classifier on the training data
ridge_classifier.fit(X_train_smote, y_train_smote)

# Make predictions on the testing data
y_pred = ridge_classifier.predict(X_test)
y_pred = (y_pred[:, 1] > 0.5).astype(int)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

# Calculate precision
precision = precision_score(y_test, y_pred)
print(f'Precision: {precision:.2f}')

# Calculate recall
recall = recall_score(y_test, y_pred)
print(f'Recall: {recall:.2f}')

# Calculate F1 Score
f1score = f1_score(y_test, y_pred)
print(f'F1 Score: {f1score:.2f}')

# Calculate ROC curve and AUC
fpr, tpr, thresholds = roc_curve(y_test, y_pred)
roc_auc = auc(fpr, tpr)

# Print AUC
print(f'AUC: {roc_auc:.2f}')

# Plot ROC curve
plt.figure(figsize=(10, 6))
plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = {:.2f})'.format(roc_auc))
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.show()

Accuracy: 0.78
Precision: 0.33
Recall: 0.65
F1 Score: 0.44
AUC: 0.73

_images/e8d796e52cc9d96bb2940cbb2cd683dfb8c35d8f896195330d599c946149c6e6.png

Naive Bayes Classifier#

from sklearn.naive_bayes import MultinomialNB

# Define the Naive Bayes model
naive_bayes = MultinomialNB()

# Define the parameter grid to search
param_grid = {
    'alpha': np.linspace(0.1, 1.0, 10),  # Adjust as needed
    'fit_prior': [True, False]
}

# Create F1 scorer
f1_scorer = make_scorer(f1_score)

# Initialize GridSearchCV
grid_search = GridSearchCV(naive_bayes, param_grid, cv=5, scoring=f1_scorer)

# Fit the grid search to the data
grid_search.fit(X_train, y_train[:, 1].astype(int))

# Print the best parameters and corresponding F1 score
print("Best Hyperparameters:", grid_search.best_params_)
print("Best F1 Score:", grid_search.best_score_)

Best Hyperparameters: {'alpha': 0.1, 'fit_prior': True}
Best F1 Score: 0.43899155529615747

# Create a kNN classifier with k=3
naive_bayes = MultinomialNB(alpha=0.1, fit_prior=True)

# Train the classifier on the training data
naive_bayes.fit(X_train, y_train[:, 1].astype(int))

# Make predictions on the testing data
y_pred = naive_bayes.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

# Calculate precision
precision = precision_score(y_test, y_pred)
print(f'Precision: {precision:.2f}')

# Calculate recall
recall = recall_score(y_test, y_pred)
print(f'Recall: {recall:.2f}')

# Calculate F1 Score
f1score = f1_score(y_test, y_pred)
print(f'F1 Score: {f1score:.2f}')

# Calculate ROC curve and AUC
fpr, tpr, thresholds = roc_curve(y_test, y_pred)
roc_auc = auc(fpr, tpr)

# Print AUC
print(f'AUC: {roc_auc:.2f}')

# Plot ROC curve
plt.figure(figsize=(10, 6))
plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = {:.2f})'.format(roc_auc))
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.show()

Accuracy: 0.79
Precision: 0.33
Recall: 0.65
F1 Score: 0.44
AUC: 0.73

_images/83645afbf77836c303f7455757f038006a4b9b3dc16e2bad7139e7b9bb2dc74a.png

Naiva Bayes Classifier con SMOTE#

# Create a kNN classifier with k=3
naive_bayes = MultinomialNB(alpha=0.1, fit_prior=True)

# Train the classifier on the training data
naive_bayes.fit(X_train_smote, y_train_smote[:, 1].astype(int))

# Make predictions on the testing data
y_pred = naive_bayes.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

# Calculate precision
precision = precision_score(y_test, y_pred)
print(f'Precision: {precision:.2f}')

# Calculate recall
recall = recall_score(y_test, y_pred)
print(f'Recall: {recall:.2f}')

# Calculate F1 Score
f1score = f1_score(y_test, y_pred)
print(f'F1 Score: {f1score:.2f}')

# Calculate ROC curve and AUC
fpr, tpr, thresholds = roc_curve(y_test, y_pred)
roc_auc = auc(fpr, tpr)

# Print AUC
print(f'AUC: {roc_auc:.2f}')

# Plot ROC curve
plt.figure(figsize=(10, 6))
plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = {:.2f})'.format(roc_auc))
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.show()

Accuracy: 0.79
Precision: 0.35
Recall: 0.71
F1 Score: 0.47
AUC: 0.76

_images/8b25377c15b28cc05c243986b686f9eb8ef84446aa0a44e207dd4447c8c06d0a.png

Random Forest Classifier#

from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer, f1_score

# Define the Random Forest model
rf_model = RandomForestClassifier()

param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'class_weight': [None, 'balanced', 'balanced_subsample']
}

# Create F1 scorer
f1_scorer = make_scorer(f1_score)

# Initialize GridSearchCV
grid_search = GridSearchCV(rf_model, param_grid, cv=5, scoring=f1_scorer)

# Fit the grid search to the data
grid_search.fit(X_train, y_train[:, 1].astype(int))

# Print the best parameters and corresponding F1 score
print("Best Hyperparameters:", grid_search.best_params_)
print("Best F1 Score:", grid_search.best_score_)

Best Hyperparameters: {'class_weight': None, 'max_depth': 7, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 200}
Best F1 Score: 0.6555674195944813

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt

rf_model = RandomForestClassifier(n_estimators=200,
                                  max_depth=7,
                                  min_samples_leaf = 1,
                                  min_samples_split = 5,
                                  class_weight = None)

# Train the classifier on the training data
rf_model.fit(X_train, y_train[:, 1].astype(int))

# Make predictions on the testing data
y_pred_rf = rf_model.predict(X_test)

# Get unique values in the array
unique_values = np.unique(y_pred_rf)
# Initialize one-hot encoded array with zeros
y_pred = np.zeros((len(y_pred_rf), len(unique_values)))
# Fill in the one-hot array based on the original binary array
for i, value in enumerate(y_pred_rf):
    y_pred[i, np.where(unique_values == value)] = 1

# Calculate accuracy
accuracy = accuracy_score(y_test[:, 1].astype(int), np.argmax(y_pred, axis=1))
print(f'Accuracy: {accuracy:.2f}')

# Calculate precision
precision = precision_score(y_test[:, 1].astype(int), np.argmax(y_pred, axis=1))
print(f'Precision: {precision:.2f}')

# Calculate recall
recall = recall_score(y_test[:, 1].astype(int), np.argmax(y_pred, axis=1))
print(f'Recall: {recall:.2f}')

# Calculate F1 Score
f1score = f1_score(y_test[:, 1].astype(int), np.argmax(y_pred, axis=1))
print(f'F1 Score: {f1score:.2f}')

# Calculate ROC curve and AUC
fpr, tpr, thresholds = roc_curve(y_test[:, 1].astype(int), np.argmax(y_pred, axis=1))
roc_auc = auc(fpr, tpr)

# Print AUC
print(f'AUC: {roc_auc:.2f}')

# Plot ROC curve
plt.figure(figsize=(10, 6))
plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = {:.2f})'.format(roc_auc))
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.show()

Accuracy: 0.93
Precision: 0.87
Recall: 0.51
F1 Score: 0.65
AUC: 0.75

_images/1814ab19c778728584066376937302e71b050bc9ed9e0bd6ca4a02e70b6cda46.png

Random Forest Classifier con SMOTE#

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt

rf_model = RandomForestClassifier(n_estimators=200,
                                  max_depth=7,
                                  min_samples_leaf = 1,
                                  min_samples_split = 5,
                                  class_weight = None)

# Train the classifier on the training data
rf_model.fit(X_train_smote, y_train_smote[:, 1].astype(int))

# Make predictions on the testing data
y_pred_rf = rf_model.predict(X_test)

# Get unique values in the array
unique_values = np.unique(y_pred_rf)
# Initialize one-hot encoded array with zeros
y_pred = np.zeros((len(y_pred_rf), len(unique_values)))
# Fill in the one-hot array based on the original binary array
for i, value in enumerate(y_pred_rf):
    y_pred[i, np.where(unique_values == value)] = 1

# Calculate accuracy
accuracy = accuracy_score(y_test[:, 1].astype(int), np.argmax(y_pred, axis=1))
print(f'Accuracy: {accuracy:.2f}')

# Calculate precision
precision = precision_score(y_test[:, 1].astype(int), np.argmax(y_pred, axis=1))
print(f'Precision: {precision:.2f}')

# Calculate recall
recall = recall_score(y_test[:, 1].astype(int), np.argmax(y_pred, axis=1))
print(f'Recall: {recall:.2f}')

# Calculate F1 Score
f1score = f1_score(y_test[:, 1].astype(int), np.argmax(y_pred, axis=1))
print(f'F1 Score: {f1score:.2f}')

# Calculate ROC curve and AUC
fpr, tpr, thresholds = roc_curve(y_test[:, 1].astype(int), np.argmax(y_pred, axis=1))
roc_auc = auc(fpr, tpr)

# Print AUC
print(f'AUC: {roc_auc:.2f}')

# Plot ROC curve
plt.figure(figsize=(10, 6))
plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = {:.2f})'.format(roc_auc))
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.show()

Accuracy: 0.86
Precision: 0.48
Recall: 0.80
F1 Score: 0.60
AUC: 0.84

_images/93356fd9e71aa91559718c4db6e321b26b507f543cfd2f705c520f5d71582eb3.png

Nueva propuesta: Stacked model#

from sklearn.linear_model import LogisticRegression
from keras.models import Sequential
from keras.layers import LSTM, Dense, Embedding
from tensorflow.keras.optimizers.legacy import Adam
from sklearn.neighbors import KNeighborsClassifier

X_trainval, X_test, y_trainval, y_test = train_test_split(X, Y, test_size = 0.15, random_state = 42)

X_train, X_valid, y_train, y_valid = train_test_split(X_trainval, y_trainval, test_size=0.2, random_state=42)

################## LSTM ##################

# matriz de 9363 x 20
embedding_matrix = np.zeros((len(d2v_model.wv.index_to_key)+ 1, 20))

model = Sequential()

model.add(Embedding(len(d2v_model.wv.index_to_key)+1,20,input_length=X.shape[1],weights=[embedding_matrix],trainable=True))

model.add(LSTM(50,return_sequences=False))

model.add(Dense(2,activation="softmax"))

model.compile(optimizer=Adam(),loss="binary_crossentropy",metrics = ['accuracy'])

batch_size = 32

history = model.fit(X_train, y_train, epochs =50, batch_size=batch_size, verbose = 2)

################## KNN ############################

# Create a kNN classifier
knn_classifier = KNeighborsClassifier(n_neighbors=3, weights='uniform')

# Train the classifier on the training data
knn_classifier.fit(X_train, y_train)

Epoch 1/50
119/119 - 6s - loss: 0.3772 - accuracy: 0.8738 - 6s/epoch - 49ms/step
Epoch 2/50
119/119 - 3s - loss: 0.0801 - accuracy: 0.9810 - 3s/epoch - 27ms/step
Epoch 3/50
119/119 - 3s - loss: 0.0316 - accuracy: 0.9916 - 3s/epoch - 27ms/step
Epoch 4/50
119/119 - 3s - loss: 0.0154 - accuracy: 0.9968 - 3s/epoch - 27ms/step
Epoch 5/50
119/119 - 3s - loss: 0.0098 - accuracy: 0.9974 - 3s/epoch - 26ms/step
Epoch 6/50
119/119 - 3s - loss: 0.0054 - accuracy: 0.9987 - 3s/epoch - 25ms/step
Epoch 7/50
119/119 - 3s - loss: 0.0042 - accuracy: 0.9989 - 3s/epoch - 25ms/step
Epoch 8/50
119/119 - 3s - loss: 0.0041 - accuracy: 0.9992 - 3s/epoch - 25ms/step
Epoch 9/50
119/119 - 3s - loss: 0.0028 - accuracy: 0.9995 - 3s/epoch - 23ms/step
Epoch 10/50
119/119 - 3s - loss: 0.0013 - accuracy: 0.9997 - 3s/epoch - 23ms/step
Epoch 11/50
119/119 - 3s - loss: 0.0014 - accuracy: 0.9997 - 3s/epoch - 23ms/step
Epoch 12/50
119/119 - 3s - loss: 9.4009e-04 - accuracy: 0.9997 - 3s/epoch - 23ms/step
Epoch 13/50
119/119 - 3s - loss: 7.3789e-04 - accuracy: 0.9997 - 3s/epoch - 23ms/step
Epoch 14/50
119/119 - 3s - loss: 5.4639e-04 - accuracy: 0.9997 - 3s/epoch - 23ms/step
Epoch 15/50
119/119 - 3s - loss: 6.1255e-04 - accuracy: 0.9997 - 3s/epoch - 24ms/step
Epoch 16/50
119/119 - 3s - loss: 3.8961e-04 - accuracy: 1.0000 - 3s/epoch - 23ms/step
Epoch 17/50
119/119 - 3s - loss: 3.0204e-04 - accuracy: 1.0000 - 3s/epoch - 23ms/step
Epoch 18/50
119/119 - 3s - loss: 2.7991e-04 - accuracy: 1.0000 - 3s/epoch - 25ms/step
Epoch 19/50
119/119 - 3s - loss: 2.7193e-04 - accuracy: 1.0000 - 3s/epoch - 25ms/step
Epoch 20/50
119/119 - 3s - loss: 1.9108e-04 - accuracy: 1.0000 - 3s/epoch - 23ms/step
Epoch 21/50
119/119 - 3s - loss: 1.7832e-04 - accuracy: 1.0000 - 3s/epoch - 24ms/step
Epoch 22/50
119/119 - 3s - loss: 1.3178e-04 - accuracy: 1.0000 - 3s/epoch - 24ms/step
Epoch 23/50
119/119 - 3s - loss: 1.1231e-04 - accuracy: 1.0000 - 3s/epoch - 26ms/step
Epoch 24/50
119/119 - 3s - loss: 1.0353e-04 - accuracy: 1.0000 - 3s/epoch - 28ms/step
Epoch 25/50
119/119 - 3s - loss: 9.9223e-05 - accuracy: 1.0000 - 3s/epoch - 27ms/step
Epoch 26/50
119/119 - 3s - loss: 7.6312e-05 - accuracy: 1.0000 - 3s/epoch - 26ms/step
Epoch 27/50
119/119 - 3s - loss: 6.6245e-05 - accuracy: 1.0000 - 3s/epoch - 26ms/step
Epoch 28/50
119/119 - 3s - loss: 6.7231e-05 - accuracy: 1.0000 - 3s/epoch - 27ms/step
Epoch 29/50
119/119 - 3s - loss: 4.9366e-05 - accuracy: 1.0000 - 3s/epoch - 25ms/step
Epoch 30/50
119/119 - 3s - loss: 4.4229e-05 - accuracy: 1.0000 - 3s/epoch - 27ms/step
Epoch 31/50
119/119 - 3s - loss: 3.8014e-05 - accuracy: 1.0000 - 3s/epoch - 26ms/step
Epoch 32/50
119/119 - 3s - loss: 3.3968e-05 - accuracy: 1.0000 - 3s/epoch - 26ms/step
Epoch 33/50
119/119 - 3s - loss: 4.0132e-05 - accuracy: 1.0000 - 3s/epoch - 25ms/step
Epoch 34/50
119/119 - 3s - loss: 0.0569 - accuracy: 0.9926 - 3s/epoch - 26ms/step
Epoch 35/50
119/119 - 3s - loss: 0.0075 - accuracy: 0.9974 - 3s/epoch - 28ms/step
Epoch 36/50
119/119 - 3s - loss: 0.0020 - accuracy: 0.9995 - 3s/epoch - 24ms/step
Epoch 37/50
119/119 - 3s - loss: 0.0022 - accuracy: 0.9995 - 3s/epoch - 24ms/step
Epoch 38/50
119/119 - 3s - loss: 9.3576e-04 - accuracy: 0.9997 - 3s/epoch - 28ms/step
Epoch 39/50
119/119 - 3s - loss: 6.2406e-04 - accuracy: 0.9997 - 3s/epoch - 29ms/step
Epoch 40/50
119/119 - 3s - loss: 5.0593e-04 - accuracy: 1.0000 - 3s/epoch - 26ms/step
Epoch 41/50
119/119 - 3s - loss: 4.2122e-04 - accuracy: 1.0000 - 3s/epoch - 24ms/step
Epoch 42/50
119/119 - 3s - loss: 3.5127e-04 - accuracy: 1.0000 - 3s/epoch - 26ms/step
Epoch 43/50
119/119 - 3s - loss: 2.9751e-04 - accuracy: 1.0000 - 3s/epoch - 27ms/step
Epoch 44/50
119/119 - 3s - loss: 2.5839e-04 - accuracy: 1.0000 - 3s/epoch - 26ms/step
Epoch 45/50
119/119 - 3s - loss: 2.2521e-04 - accuracy: 1.0000 - 3s/epoch - 24ms/step
Epoch 46/50
119/119 - 3s - loss: 1.9927e-04 - accuracy: 1.0000 - 3s/epoch - 28ms/step
Epoch 47/50
119/119 - 4s - loss: 1.7689e-04 - accuracy: 1.0000 - 4s/epoch - 30ms/step
Epoch 48/50
119/119 - 3s - loss: 1.5731e-04 - accuracy: 1.0000 - 3s/epoch - 26ms/step
Epoch 49/50
119/119 - 3s - loss: 1.4191e-04 - accuracy: 1.0000 - 3s/epoch - 25ms/step
Epoch 50/50
119/119 - 3s - loss: 1.2780e-04 - accuracy: 1.0000 - 3s/epoch - 25ms/step

KNeighborsClassifier(n_neighbors=3)

In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.

from sklearn.linear_model import LogisticRegression

# Hacer predicciones en el conjunto de validación
pred1 = model.predict(X_valid)

pred2 = knn_classifier.predict(X_valid)

# Stack predicciones para crear nuevas características para el meta-modelo
stacked_predictions = np.column_stack((pred1, pred2))

# Entrenar el meta-modelo
meta_model = LogisticRegression()

y_valid_onehot = y_valid[:, 1].astype(int)

meta_model.fit(stacked_predictions, y_valid_onehot)

# Para hacer predicciones en nuevos datos
new_pred1 = model.predict(X_test)

new_pred2 = knn_classifier.predict(X_test)

stacked_new_predictions = np.column_stack((new_pred1, new_pred2))

final_predictions = meta_model.predict(stacked_new_predictions)

30/30 [==============================] - 1s 11ms/step
27/27 [==============================] - 0s 10ms/step

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt 

# Calcular y mostrar la precisión
Y_test_onehot = y_test[:, 1].astype(int)

# Calculate accuracy
accuracy = accuracy_score(Y_test_onehot, final_predictions)
print(f'Accuracy: {accuracy:.2f}')

# Calculate precision
precision = precision_score(Y_test_onehot, final_predictions)
print(f'Precision: {precision:.2f}')

# Calculate recall
recall = recall_score(Y_test_onehot, final_predictions)
print(f'Recall: {recall:.2f}')

# Calculate F1 Score
f1score = f1_score(Y_test_onehot, final_predictions)
print(f'F1 Score: {f1score:.2f}')

# Calculate ROC curve and AUC
fpr, tpr, thresholds = roc_curve(Y_test_onehot, final_predictions)
roc_auc = auc(fpr, tpr)

# Print AUC
print(f'AUC: {roc_auc:.2f}')

# Plot ROC curve
plt.figure(figsize=(10, 6))
plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = {:.2f})'.format(roc_auc))
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.show()

Accuracy: 0.99
Precision: 0.98
Recall: 0.93
F1 Score: 0.95
AUC: 0.96

_images/60c11ff9c0674d8c05c875a05026b4452561a9efac934c13f336390f3ec77438.png

Stacked Model con SMOTE#

from sklearn.linear_model import LogisticRegression
from keras.models import Sequential
from keras.layers import LSTM, Dense, Embedding
from tensorflow.keras.optimizers.legacy import Adam
from sklearn.neighbors import KNeighborsClassifier

X_trainval, X_test, y_trainval, y_test = train_test_split(X, Y, test_size = 0.15, random_state = 42)

X_train, X_valid, y_train, y_valid = train_test_split(X_trainval, y_trainval, test_size=0.2, random_state=42)

################## SMOTE #################

from imblearn.over_sampling import SMOTE

# Crear una instancia de SMOTE
smote = SMOTE(random_state=42)

y_for_smote = np.argmax(y_train, axis=1)

# Aplicar SMOTE al conjunto de entrenamiento
X_train_smote, y_for_smote = smote.fit_resample(X_train, y_train)

# Get unique values in the array
unique_values = np.unique(y_for_smote)

# Initialize one-hot encoded array with zeros
y_train_smote = np.zeros((len(y_for_smote), len(unique_values)))

# Fill in the one-hot array based on the original binary array
for i, value in enumerate(y_for_smote):

    y_train_smote[i, np.where(unique_values == value)] = 1

################## LSTM ##################

# matriz de 9363 x 20
embedding_matrix = np.zeros((len(d2v_model.wv.index_to_key)+ 1, 20))

model = Sequential()

model.add(Embedding(len(d2v_model.wv.index_to_key)+1,20,input_length=X.shape[1],weights=[embedding_matrix],trainable=True))

model.add(LSTM(50,return_sequences=False))

model.add(Dense(2,activation="softmax"))

model.compile(optimizer=Adam(),loss="binary_crossentropy",metrics = ['accuracy'])

batch_size = 32

history = model.fit(X_train_smote, y_train_smote, epochs =50, batch_size=batch_size, verbose = 0)

################## KNN ############################

# Create a kNN classifier
knn_classifier = KNeighborsClassifier(n_neighbors=3, weights='uniform')

# Train the classifier on the training data
knn_classifier.fit(X_train_smote, y_train_smote)

from sklearn.linear_model import LogisticRegression

# Hacer predicciones en el conjunto de validación
pred1 = model.predict(X_valid)

pred2 = knn_classifier.predict(X_valid)

# Stack predicciones para crear nuevas características para el meta-modelo
stacked_predictions = np.column_stack((pred1, pred2))

# Entrenar el meta-modelo
meta_model = LogisticRegression()

y_valid_onehot = y_valid[:, 1].astype(int)

meta_model.fit(stacked_predictions, y_valid_onehot)

# Para hacer predicciones en nuevos datos
new_pred1 = model.predict(X_test)

new_pred2 = knn_classifier.predict(X_test)

stacked_new_predictions = np.column_stack((new_pred1, new_pred2))

final_predictions = meta_model.predict(stacked_new_predictions)

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt 

# Calcular y mostrar la precisión
Y_test_onehot = y_test[:, 1].astype(int)

# Calculate accuracy
accuracy = accuracy_score(Y_test_onehot, final_predictions)
print(f'Accuracy: {accuracy:.2f}')

# Calculate precision
precision = precision_score(Y_test_onehot, final_predictions)
print(f'Precision: {precision:.2f}')

# Calculate recall
recall = recall_score(Y_test_onehot, final_predictions)
print(f'Recall: {recall:.2f}')

# Calculate F1 Score
f1score = f1_score(Y_test_onehot, final_predictions)
print(f'F1 Score: {f1score:.2f}')

# Calculate ROC curve and AUC
fpr, tpr, thresholds = roc_curve(Y_test_onehot, final_predictions)
roc_auc = auc(fpr, tpr)

# Print AUC
print(f'AUC: {roc_auc:.2f}')

# Plot ROC curve
plt.figure(figsize=(10, 6))
plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = {:.2f})'.format(roc_auc))
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.show()

30/30 [==============================] - 1s 10ms/step
27/27 [==============================] - 0s 9ms/step
Accuracy: 0.91
Precision: 0.59
Recall: 0.96
F1 Score: 0.73
AUC: 0.93

_images/4958d19ab0d1b34a0ce8476e3fa7985473bf726264bea062977a7da6ac13b332.png

Conclusiones#

a continuación se resumen todas las métricas obtenidas de los modelos de clasificación:

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Crear el dataframe con los datos
data = {
    'Modelo': ['LSTM', 'LSTM con SMOTE', 'KNN', 'KNN con SMOTE', 'Ridge', 'Ridge con SMOTE',
               'Naive Bayes', 'Naive Bayes con SMOTE', 'Random Forest', 'Random Forest con SMOTE',
               'Stacked Model', 'Stacked Model con SMOTE'],
    'Accuracy': [0.99, 0.91, 0.88, 0.82, 0.87, 0.78, 0.79, 0.79, 0.93, 0.86, 0.99, 0.91],
    'Precision': [0.98, 0.58, 0.55, 0.39, 0.42, 0.33, 0.33, 0.35, 0.87, 0.48, 0.98, 0.59],
    'Recall': [0.93, 0.95, 0.43, 0.71, 0.12, 0.65, 0.65, 0.71, 0.51, 0.80, 0.93, 0.96],
    'F1-score': [0.96, 0.72, 0.48, 0.50, 0.19, 0.44, 0.44, 0.47, 0.65, 0.60, 0.95, 0.73],
    'AUC': [0.97, 0.93, 0.69, 0.77, 0.55, 0.73, 0.73, 0.76, 0.75, 0.84, 0.96, 0.93]
}

df = pd.DataFrame(data)

# Crear el heatmap
plt.figure(figsize=(10, 8))
sns.set(font_scale=1)
sns.heatmap(df.set_index('Modelo'), cmap='RdYlGn', annot=True, fmt=".2f", linewidths=.5)
plt.title('Metricas de Modelos utilizados para la Clasificación')
plt.show()

_images/09ee6f31abb724a89cd1ecedb73e08adeb1d11260bd256863bc2621e1fba7e28.png

Modelado y Desempeño: La utilización del enfoque Doc2Vec para representar los mensajes en forma de vectores fue un paso determinante en la obtención de un modelo bastante efectivo para la tarea de clasificación de correos, permitiendo que los mensajes se convirtieran en entradas adecuadas para el modelo de deep learning. La elección de una red LSTM para la clasificación de mensajes demostró ser adecuada, dada la naturaleza secuencial del texto. Durante el entrenamiento, el modelo demostró una capacidad impresionante para adaptarse a los datos, logrando una precisión casi perfecta, además de superar por mucho a los modelos de machine learning clásicos empleados para esta tarea.
La técnica SMOTE demuestra no ser muy útil para mejorar las métricas del modelo LSTM para clasificar los correos, sin embargo, esta técnica sí favorece al performance de modelos de machine learning clásicos tales como KNN, Ridge y Random Forest. Adicionalmente, la técnica ha demostrado no ser tán efectiva en problemas de NLP tanto como lo es en problemas de regresión.
Modelo Stacked: El modelo Stacked usando la red neuronal con LSTM y KNN demuestra no tener un mal performance, sin embargo no supera a la técnica LSTM por sí sola, sin embargo, al evaluar la métrica de RECALL, se evidencia que el Stacked Model con SMOTE es el modelo que mejor RECALL posee de todos.