Grooming/pan12.py at main · coaka/Grooming · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
# -*- coding: utf-8 -*-
"""
Created on Sat Nov  2 19:50:30 2024

@author: lenovo
"""
import numpy as np
import re
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from plyer import notification  # For desktop notifications
import smtplib  # For sending emails
from email.mime.text import MIMEText
from email.mime.multipart import MIMEMultipart
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score, confusion_matrix, classification_report
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Fetch the NLTK resources this script requests at start-up, one call per
# resource name, in the same order as before.
for _nltk_resource in ('punkt', 'punkt_tab'):
    nltk.download(_nltk_resource)
def send_email_alert():
    """Send the fixed "Grooming Alert" e-mail through Gmail's SMTP-over-SSL port.

    The sender address, recipient address and sender password are taken from
    the environment variables ``GROOMING_ALERT_SENDER``,
    ``GROOMING_ALERT_RECEIVER`` and ``GROOMING_ALERT_PASSWORD``.  When a
    variable is unset, the value that used to be hard-coded here is used
    instead, so existing callers keep working without any configuration.

    Returns:
        None.  ``"Email sent successfully!"`` is printed after the message
        has been handed to the server.

    Raises:
        Nothing is caught here: any error raised by ``smtplib`` or ``ssl``
        while connecting, logging in or sending propagates to the caller.
    """
    import os
    import ssl

    # NOTE(security): the fallback literals below are credentials committed to
    # source control.  They are kept only for backward compatibility; rotate
    # the password and supply the values through the environment instead.
    sender_email = os.environ.get("GROOMING_ALERT_SENDER", "ibra.halema@gmail.com")
    receiver_email = os.environ.get(
        "GROOMING_ALERT_RECEIVER", "ibrahim_alhlima@uomosul.edu.iq"
    )
    password = os.environ.get("GROOMING_ALERT_PASSWORD", "uedr pwva czvj tqew")

    # Create the email content
    subject = "Grooming Alert"
    val = 'A grooming message has been detected, your chaild under grooming'

    msg = MIMEMultipart()
    msg['From'] = sender_email
    msg['To'] = receiver_email
    msg['Subject'] = subject
    msg.attach(MIMEText(val, 'plain'))

    # The default context verifies the server certificate and host name.  It
    # has to be passed to SMTP_SSL explicitly to take effect; previously it
    # was created but never used.
    context = ssl.create_default_context()

    # Send the email; the ``with`` block closes the connection on exit.
    with smtplib.SMTP_SSL('smtp.gmail.com', 465, context=context) as server:  # For Gmail
        server.login(sender_email, password)
        server.sendmail(sender_email, receiver_email, msg.as_string())
    print("Email sent successfully!")

# Load the dataset.  Only two columns are read below: 'segment' (the text)
# and 'label' (the class of that text).
df = pd.read_csv(r"pan12_dataset.csv", encoding='cp1252')
print("Shapeeeeeeeeeee", df.shape)

# Class names.  LabelEncoder sorts them, so the position of a name in
# ``label_encoder.classes_`` is the integer code used for that class.
candidate_labels = ['non-predator', 'predator']
label_encoder = LabelEncoder()
label_encoder.fit(candidate_labels)
class_index = {str(name): code for code, name in enumerate(label_encoder.classes_)}

# Accept the label either as a class name or as its numeric code, and reduce
# both spellings to one integer code per row.  Encoding by name and then
# filtering on "is a numeric string" cannot both succeed on the same column,
# so the normalisation is done once, here.
raw_labels = df['label'].astype(str).str.strip()
label_codes = raw_labels.str.lower().map(class_index)
label_codes = label_codes.fillna(pd.to_numeric(raw_labels, errors='coerce'))

# Rows whose label is missing or is not one of the known class codes carry no
# usable target and are dropped.
has_valid_label = label_codes.isin(list(class_index.values()))
df = df[has_valid_label].copy()
if df.empty:
    raise ValueError(
        "pan12_dataset.csv contains no row whose 'label' is one of "
        f"{candidate_labels} or the matching integer code"
    )
df['label'] = label_codes[has_valid_label].astype(int)
df['segment'] = df['segment'].fillna('')  # a missing text becomes an empty document

# Separate features (segment) and labels.  X, y and df now describe exactly
# the same rows, in the same order.
y = df['label'].to_numpy()
X = df['segment']
from sklearn.manifold import TSNE

# Inputs for the sequence model built further down: raw segment texts, their
# labels, and the fixed sequence length.
max_length = 100  # Adjust based on text length
texts = df['segment'].values
labels = df['label'].values

# TF-IDF features over unigrams and bigrams; X is rebound from the raw text
# column to the resulting document-term matrix.
tfidf_options = dict(
    max_features=1000,
    stop_words='english',
    ngram_range=(1, 2),
    lowercase=True,
)
vectorizer = TfidfVectorizer(**tfidf_options)
X = vectorizer.fit_transform(X)

# ---------------------------- t-SNE plot ----------------------------
# Project the TF-IDF matrix to two dimensions with a fixed seed.
tsne = TSNE(n_components=2, random_state=42, init='random')
X_tsne = tsne.fit_transform(X)
tsne_first, tsne_second = X_tsne[:, 0], X_tsne[:, 1]

plt.figure(figsize=(12, 9))
scatter = plt.scatter(tsne_first, tsne_second, c=y, cmap='viridis', alpha=0.7)

# One legend entry per colour in the scatter, named after the classes and
# placed outside the axes on the right.
handles, _ = scatter.legend_elements()
legend_options = dict(
    title="Classes",
    loc='upper left',
    bbox_to_anchor=(1.13, 0.7),
    ncol=1,
    frameon=False,
)
plt.legend(handles, candidate_labels, **legend_options)
plt.tight_layout()
plt.colorbar(label="Class")

plt.title("t-SNE Visualization of PAN12 Dataset")
plt.xlabel("t-SNE Component 1")
plt.ylabel("t-SNE Component 2")

# Write the figure to disk before showing it.
plt.savefig("Pan12_Tsne.jpg", bbox_inches='tight')
plt.show()
print("Finish ploting Tsne")
# ---------------------------------------------------------------------
# Tokenize text
# One vocabulary size for both the tokenizer and the embedding table, so the
# two can never drift apart.
vocab_size = 10000
tokenizer = Tokenizer(num_words=vocab_size, oov_token='<OOV>')
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

# Bring every sequence to exactly max_length ids: longer ones lose their tail
# ('post' truncation); the padding side is left at the pad_sequences default.
padded_sequences = pad_sequences(sequences, maxlen=max_length, truncating='post')

# Define the LSTM model architecture
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import AUC

# Embedding -> two stacked bidirectional LSTMs -> small dense head ending in
# a single sigmoid unit (binary output).
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=128, input_length=max_length),
    Bidirectional(LSTM(64, return_sequences=True)),  # full sequence out, feeds the next LSTM
    Bidirectional(LSTM(32)),
    Dense(32, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(
    loss='binary_crossentropy',
    optimizer=Adam(learning_rate=1e-4),
    metrics=['accuracy', AUC(name='auc')]
)

# Display model architecture
model.summary()

# Split the data into training and testing sets (80/20, fixed seed)
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    padded_sequences, labels, test_size=0.2, random_state=42
)

# Train the LSTM model
# NOTE(review): the held-out test split is also used as validation data, so
# the per-epoch validation figures and the final test figures come from the
# same rows.
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))

# Score the test set once; the sigmoid output is the probability of class 1.
# Flattened to one value per test row.
y_pred_proba = model.predict(X_test).ravel()
# Hard 0/1 predictions.  np.round keeps the original tie rule: a probability
# of exactly 0.5 becomes 0.
y_pred = np.round(y_pred_proba).astype(int)

# AUC-ROC Score
auc = roc_auc_score(y_test, y_pred_proba)
print(f"AUC-ROC: {auc:.3f}")

# ROC curve
fpr, tpr, _ = metrics.roc_curve(y_test, y_pred_proba)
plt.plot(fpr, tpr, label="data 1, auc="+str(auc))
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc=4)
plt.show()

# Confusion Matrix
cm = confusion_matrix(y_test, y_pred)

sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['0', '1'], yticklabels=['0', '1'])
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.show()

# Classification Report
print(classification_report(y_test, y_pred))