-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtrain_content_model.py
More file actions
75 lines (62 loc) · 3.14 KB
/
train_content_model.py
File metadata and controls
75 lines (62 loc) · 3.14 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import joblib
import os
import time
# --- Configuration ---
# Directory (relative to the working directory) that holds the processed CSVs.
PROCESSED_DATA_DIR = "data/processed"
# File name of the processed training split inside PROCESSED_DATA_DIR.
TRAIN_DATA_FILE = "train_df.csv"
def train_and_save_model(train_data_path=None):
    """
    Loads the processed training data, trains the content-based Logistic
    Regression model, and saves the model and vectorizer artifacts.

    The text feature is the 'genres' column joined with the 'overview'
    column (an empty 'overview' is substituted when that column is absent);
    the target is the 'liked' column.

    Args:
        train_data_path: Optional path to the training CSV. When None, the
            path is built from PROCESSED_DATA_DIR and TRAIN_DATA_FILE.

    Returns:
        None. On unusable input (file missing, file empty, required column
        missing, no rows) an "ERROR: ..." message is printed and the
        function returns early without writing any artifact.

    Side effects:
        Writes 'log_reg_ngram_model.pkl' and 'vectorizer_ngram.pkl' via
        joblib using relative paths, i.e. into the current working directory.
    """
    print("Starting content-based model training...")
    start_time = time.time()

    # --- Step 1: Load Processed Training Data ---
    if train_data_path is None:
        train_data_path = os.path.join(PROCESSED_DATA_DIR, TRAIN_DATA_FILE)

    print(f"Loading training data from: {train_data_path}")
    # Only the read itself sits in the try block so unrelated errors are not masked.
    try:
        # Note: only the training split is needed for training.
        train_df = pd.read_csv(train_data_path)
    except FileNotFoundError:
        print(f"ERROR: Training data not found at '{train_data_path}'.")
        print("Please run the 'data_loading.py' script first to generate the processed data.")
        return
    except pd.errors.EmptyDataError:
        # A zero-byte file has no header for pandas to parse.
        print(f"ERROR: Training data at '{train_data_path}' is empty.")
        return
    print("Training data loaded successfully.")

    # 'overview' is optional (handled below); 'genres' and 'liked' are indexed
    # unconditionally, so report their absence instead of failing on a KeyError.
    missing_columns = [col for col in ('genres', 'liked') if col not in train_df.columns]
    if missing_columns:
        print(f"ERROR: Training data is missing required column(s): {', '.join(missing_columns)}.")
        return
    if train_df.empty:
        print("ERROR: Training data contains no rows; nothing to train on.")
        return

    # --- Step 2: Feature Engineering and Preparation ---
    print("Preparing content features for training...")
    # Create the 'content' feature from genres and overview.
    # Add a placeholder for 'overview' in case the column is missing after loading.
    if 'overview' not in train_df.columns:
        train_df['overview'] = ''
    # Fill gaps first (so NaN does not become the text "nan"), then cast each
    # part to str so non-string cells cannot break the concatenation.
    genres_text = train_df['genres'].fillna('').astype(str)
    overview_text = train_df['overview'].fillna('').astype(str)
    train_df['content'] = genres_text + ' ' + overview_text

    X_train_text = train_df['content']
    y_train = train_df['liked']
    print(f"Prepared {len(X_train_text)} text entries for vectorization.")

    # --- Step 3: Vectorization (TF-IDF with N-grams) ---
    print("Vectorizing text using TF-IDF with N-grams (max_features=5000)...")
    # Parameters carried over from the notebook experiments
    # [cite: mars-mvp-data-preprocessing-2.ipynb]
    vectorizer_ngram = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
    X_train_tfidf_ngram = vectorizer_ngram.fit_transform(X_train_text)
    print("Vectorization complete.")

    # --- Step 4: Train the Logistic Regression Model ---
    print("Training Logistic Regression model...")
    # Hyperparameters carried over from Phase 1
    # [cite: mars-mvp-data-preprocessing-2.ipynb]
    log_reg_ngram = LogisticRegression(C=10, solver='liblinear', random_state=42)
    log_reg_ngram.fit(X_train_tfidf_ngram, y_train)
    print("Model training complete.")

    # --- Step 5: Save the Artifacts ---
    print("Saving model and vectorizer artifacts to the root directory...")
    joblib.dump(log_reg_ngram, 'log_reg_ngram_model.pkl')
    joblib.dump(vectorizer_ngram, 'vectorizer_ngram.pkl')

    end_time = time.time()
    print("\n--- Success! ---")
    print(f"Process finished in {end_time - start_time:.2f} seconds.")
    print("The following files have been saved in your project's root directory:")
    print("- log_reg_ngram_model.pkl")
    print("- vectorizer_ngram.pkl")
# Script entry point: run the training pipeline only when this file is
# executed directly, not when it is imported as a module.
if __name__ == "__main__":
    train_and_save_model()