-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdota2_training_lstm_algorithm.py
More file actions
200 lines (165 loc) · 7.15 KB
/
dota2_training_lstm_algorithm.py
File metadata and controls
200 lines (165 loc) · 7.15 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
# -*- coding: utf-8 -*-
"""
Created on Sat Nov 2 19:50:30 2024
@author: lenovo
"""
import torch
import numpy as np
import re
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from plyer import notification # For desktop notifications
import smtplib # For sending emails
from email.mime.text import MIMEText
from email.mime.multipart import MIMEMultipart
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score, confusion_matrix, classification_report
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics
from sklearn.preprocessing import StandardScaler, LabelEncoder
from scipy.sparse import issparse
from sklearn.preprocessing import OneHotEncoder
import numpy as np
from sklearn.model_selection import cross_val_score
nltk.download('punkt')
nltk.download('punkt_tab')
# Load the chat-log CSV splits and concatenate them into a single DataFrame.
# BUG FIX: the original called pd.concat([df, df1]) inside the loop, which
# re-copies the growing frame on every iteration (quadratic); collect the
# per-file frames in a list and concatenate once instead.
# NOTE(review): splits 3-11 are currently disabled — confirm which files
# belong in the training set:
# "Dota2_english_3.csv" ... "Dota2_english_11.csv"
file_paths = ["Dota2_english.csv", "Dota2_english_2.csv"]
frames = []
for file in file_paths:
    print('Reading dataset files...')
    frames.append(pd.read_csv(file))
# ignore_index=True gives a clean 0..n-1 index (matches the commented-out
# intended version; downstream code only uses .values, never the index).
df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
del frames
print("Rows loaded:", df.shape[0])
# Remove the stray index column that a previous to_csv export may have
# written; errors='ignore' makes this a no-op when the column is absent.
df.drop(columns=['Unnamed: 0'], inplace=True, errors='ignore')

# The seven target categories the classifier distinguishes.
candidate_labels = [
    'chitchat',
    'game features',
    'coordination',
    'toxic offense',
    'gender discrimination',
    'religious intolerance',
    'racism',
]

# Features: the translated English chat text. Labels: the category name
# attached to each message. Both as string NumPy arrays.
X = df['text_en'].values.astype('str')
y = df['label'].values.astype('str')
# NOTE(review): TSNE is imported for the "PLOT TSNE" section flagged below,
# but no t-SNE plot appears in this part of the file — confirm it is used later.
from sklearn.manifold import TSNE
# Build a TF-IDF representation of the chat text: uni- and bigrams, English
# stop words removed, vocabulary capped at 1000 features.
# Join tokens into strings for TF-IDF
vectorizer = TfidfVectorizer(
    max_features=1000,
    stop_words='english',
    ngram_range=(1, 2),  # Unigrams and bigrams
    lowercase=True
)
# X is replaced by the sparse TF-IDF matrix; the raw strings are kept
# separately in `texts` for the Keras tokenizer below.
X= vectorizer.fit_transform(X)
texts = df['text_en'].astype(str)
labels = df['label']
max_length = 100  # Adjust based on text length
########PLOT TSNE################
# Encode class labels if they are not numerical
# Map the seven string labels to integers 0..6; fitting on the fixed
# candidate_labels list keeps the mapping stable across runs.
label_encoder = LabelEncoder()
label_encoder.fit(candidate_labels)
y = label_encoder.transform(y)
num_classes = len(label_encoder.classes_)
# Convert to dense array if sparse
# NOTE(review): X_ is only bound when X is sparse and is not used anywhere
# in this part of the file — presumably meant for the t-SNE plot; verify.
if issparse(X):
    print("CONVER TO ARRAYYYYYYYYYYYYYYYYYYYYYYYYYY")
    X_ = X.toarray()
# Integer-encode the chat text for the LSTM: fit a 10k-word vocabulary
# (unknown words map to the explicit '<OOV>' token), then pad or truncate
# every sequence to max_length tokens, cutting from the end.
corpus = texts.astype(str)
tokenizer = Tokenizer(num_words=10000, oov_token='<OOV>')
tokenizer.fit_on_texts(corpus)
sequences = tokenizer.texts_to_sequences(corpus)
padded_sequences = pad_sequences(sequences, maxlen=max_length, truncating='post')
# Define the bidirectional-LSTM text classifier over the padded sequences.
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import AUC
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score

max_words = 5000  # NOTE(review): unused here (Embedding uses 10000) — confirm
max_len = 100     # NOTE(review): unused here (padding uses max_length) — confirm

# BUG FIX: the original head was Dense(1, activation='sigmoid') trained with
# 'categorical_crossentropy', which cannot represent the 7-way problem and
# mismatches the integer labels produced by LabelEncoder (argmax over a
# single sigmoid unit always predicts class 0). The head is now a softmax
# over num_classes trained with sparse categorical cross-entropy, matching
# the integer-encoded y used in model.fit below.
model = Sequential([
    Embedding(input_dim=10000, output_dim=128, input_length=max_length),
    Bidirectional(LSTM(128, return_sequences=True)),
    Bidirectional(LSTM(64)),
    Dense(32, activation='relu'),
    Dropout(0.1),
    Dense(num_classes, activation='softmax'),
])

# Compile the model. The Keras AUC metric is omitted because it expects
# one-hot targets while training uses integer labels; AUC is computed
# post hoc with sklearn's roc_auc_score during evaluation.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=Adam(learning_rate=1e-4),
    metrics=['accuracy'],
)

# Display model architecture
model.summary()
# Hold out 20% of the padded sequences for final evaluation; the fixed
# random_state makes the split reproducible.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    padded_sequences, y, test_size=0.2, random_state=42
)
# One-hot encoder over the integer class labels, fitted on the full
# training labels so every class column exists at evaluation time.
encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
encoder.fit(y_train.reshape(-1, 1))
############CROSS VALIDATION#################
# K-fold cross-validation over the training split only.
# NOTE(review): the model object is shared across folds, so weights carry
# over from one fold to the next; for unbiased CV the model should be
# rebuilt per fold (requires refactoring the build step into a factory).
k = 5  # Number of folds
kf = KFold(n_splits=k, shuffle=True, random_state=42)
accuracy_per_fold = []
for fold, (train_index, val_index) in enumerate(kf.split(X_train)):
    print(f'Fold {fold + 1}')
    # Split the training data into train/validation subsets for this fold.
    X_train_fold, X_val_fold = X_train[train_index], X_train[val_index]
    y_train_fold, y_val_fold = y_train[train_index], y_train[val_index]
    # BUG FIX: validate on this fold's held-out subset, not on the final
    # test set — monitoring (X_test, y_test) during CV leaks test data
    # into training-time decisions.
    model.fit(X_train_fold, y_train_fold, epochs=2, batch_size=32,
              validation_data=(X_val_fold, y_val_fold))
    # Hard predictions = argmax over the per-class probabilities.
    val_predictions = np.argmax(model.predict(X_val_fold), axis=1)
    accuracy = accuracy_score(y_val_fold, val_predictions)
    # Store the accuracy for this fold
    accuracy_per_fold.append(accuracy)
    print(f'Accuracy for fold {fold + 1}: {accuracy * 100:.2f}%')
# Average accuracy across all folds.
average_accuracy = np.mean(accuracy_per_fold)
print(f'\nAverage Accuracy Across {k} Folds: {average_accuracy * 100:.2f}%')
############################################
# Evaluate on the held-out test set.
# BUG FIX: predict once (the original called model.predict twice on X_test),
# and drop the `.detach().numpy()` call on line "auc = ..." — model.predict
# already returns a NumPy array, and .detach() is a PyTorch tensor method,
# so the original raised AttributeError at runtime.
y_pred_proba = model.predict(X_test)  # per-class probabilities
y_pred = y_pred_proba                 # alias kept for downstream code
y_pred_labels = y_pred.argmax(axis=1)  # hard class predictions
y_test_encoded = encoder.transform(y_test.reshape(-1, 1))
# Get class indices from the encoder
all_classes = encoder.categories_[0]  # Encoded class labels
present_classes = np.unique(y_test)   # Classes actually in y_test
# Map each class that occurs in y_test to its column in the one-hot encoding.
present_class_indices = [np.where(all_classes == cls)[0][0] for cls in present_classes]
# Compute one-vs-rest AUC only for present classes (roc_auc_score is
# undefined for a class with no positive examples).
auc_scores = []
for idx in present_class_indices:
    auc = roc_auc_score(y_test_encoded[:, idx], y_pred[:, idx])
    auc_scores.append(auc)
# Mean AUC across the classes present in the test set.
mean_auc = np.mean(auc_scores)
print("Mean AUC Score:", mean_auc)
# NOTE(review): this single call still fails if any class is missing from
# y_test — the per-class loop above is the robust variant.
auc = roc_auc_score(y_test, y_pred, multi_class='ovr')
print(f"AUC-ROC: {auc:.3f}")
# Confusion Matrix