Skip to content

Commit 20c108c

Browse files
Add complete ML pipeline with model training, testing, and API components
1 parent 64b3da8 commit 20c108c

25 files changed

Lines changed: 977 additions & 409 deletions

Dockerfile

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
FROM python:3.9-slim

WORKDIR /app

# Install system dependencies: build-essential for compiling Python wheels,
# git for VCS-pinned requirements; clean apt caches to keep the layer small.
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
    build-essential \
    git \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

# Copy requirements first to leverage Docker cache: this layer is only
# rebuilt when requirements.txt changes, not on every source edit.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy the rest of the application
COPY . .

# Install the package in development (editable) mode
RUN pip install -e .

# Set environment variables: make /app importable and disable stdout
# buffering so logs appear immediately in `docker logs`.
ENV PYTHONPATH=/app
ENV PYTHONUNBUFFERED=1

# Expose ports: 8000 for FastAPI, 5000 for the MLflow UI
EXPOSE 8000
EXPOSE 5000

# Command to run when container starts.
# Use uvicorn to serve the FastAPI application (exec form: uvicorn is PID 1
# and receives signals directly).
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "8000"]

data_schema/schema.yaml

Lines changed: 64 additions & 62 deletions
Original file line numberDiff line numberDiff line change
@@ -1,65 +1,67 @@
1+
#data_schema/schema.yaml
# Expected dtype for every column of the dataset, keyed by column name.
# NOTE(review): column names (incl. the apparent typos `popUpWidnow`,
# `Domain_registeration_length`) match the source dataset — do not "fix" them.
columns:
  having_IP_Address: int64
  URL_Length: int64
  Shortining_Service: int64
  having_At_Symbol: int64
  double_slash_redirecting: int64
  Prefix_Suffix: int64
  having_Sub_Domain: int64
  SSLfinal_State: int64
  Domain_registeration_length: int64
  Favicon: int64
  port: int64
  HTTPS_token: int64
  Request_URL: int64
  URL_of_Anchor: int64
  Links_in_tags: int64
  SFH: int64
  Submitting_to_email: int64
  Abnormal_URL: int64
  Redirect: int64
  on_mouseover: int64
  RightClick: int64
  popUpWidnow: int64
  Iframe: int64
  age_of_domain: int64
  DNSRecord: int64
  web_traffic: int64
  Page_Rank: int64
  Google_Index: int64
  Links_pointing_to_page: int64
  Statistical_report: int64
  Result: int64

# Columns treated as numerical — presumably consumed by the data-validation
# step; confirm against DataValidation before relying on this list.
numerical_columns:
  - having_IP_Address
  - URL_Length
  - Shortining_Service
  - having_At_Symbol
  - double_slash_redirecting
  - Prefix_Suffix
  - having_Sub_Domain
  - SSLfinal_State
  - Domain_registeration_length
  - Favicon
  - port
  - HTTPS_token
  - Request_URL
  - URL_of_Anchor
  - Links_in_tags
  - SFH
  - Submitting_to_email
  - Abnormal_URL
  - Redirect
  - on_mouseover
  - RightClick
  - popUpWidnow
  - Iframe
  - age_of_domain
  - DNSRecord
  - web_traffic
  - Page_Rank
  - Google_Index
  - Links_pointing_to_page
  - Statistical_report
  - Result

main.py

Lines changed: 21 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1,34 +1,33 @@
11
from networksecurity.components.data_ingestion import DataIngestion
from networksecurity.components.data_validation import DataValidation
from networksecurity.components.data_transformation import DataTransformation
from networksecurity.components.model_trainer import ModelTrainer
from networksecurity.entity.config_entity import DataIngestionConfig, DataValidationConfig, DataTransformationConfig, ModelTrainerConfig, TrainingPipelineConfig
from networksecurity.exception.exception import NetworkSecurityException
from networksecurity.logging.logger import logging
import sys


if __name__ == "__main__":
    try:
        # Build every stage configuration up front from the shared
        # training-pipeline config.
        pipeline_cfg = TrainingPipelineConfig()
        ingestion_cfg = DataIngestionConfig(pipeline_cfg)
        validation_cfg = DataValidationConfig(pipeline_cfg)
        transformation_cfg = DataTransformationConfig(pipeline_cfg)
        trainer_cfg = ModelTrainerConfig(pipeline_cfg)

        # Stage 1: ingest raw data.
        ingestion_artifact = DataIngestion(ingestion_cfg).initiate_data_ingestion()

        # Stage 2: validate the ingested data.
        validation_artifact = DataValidation(ingestion_artifact, validation_cfg).initiate_data_validation()

        # Stage 3: transform validated data into model-ready features.
        transformation_artifact = DataTransformation(validation_artifact, transformation_cfg).initiate_data_transformation()

        # Stage 4: train the model on the transformed data.
        trainer_artifact = ModelTrainer(trainer_cfg, transformation_artifact).initiate_model_trainer()

    except Exception as e:
        # Wrap any failure in the project exception so the error carries
        # file/line context via sys.
        raise NetworkSecurityException(e, sys)
Lines changed: 43 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -1,84 +1,76 @@
1-
import os
import sys

import numpy as np
import pandas as pd
import pymongo
from sklearn.model_selection import train_test_split
from dotenv import load_dotenv

from networksecurity.exception.exception import NetworkSecurityException
from networksecurity.logging.logger import logging
from networksecurity.entity.config_entity import DataIngestionConfig
from networksecurity.entity.artifact_entity import DataIngestionArtifact

load_dotenv()
uri = os.getenv('MONGODB_URI')


class DataIngestion:
    """Pull the raw dataset from MongoDB, persist it to the feature store,
    and split it into train/test CSV files."""

    def __init__(self, data_ingestion_config: DataIngestionConfig):
        # Paths, database/collection names and split ratio all come from
        # this config object.
        self.data_ingestion_config = data_ingestion_config

    def export_collection_as_dataframe(self) -> pd.DataFrame:
        """
        Export the configured MongoDB collection as a DataFrame.

        Drops the Mongo-internal ``_id`` column and normalizes empty
        strings to NaN.

        Raises:
            NetworkSecurityException: on any connection or read failure.
        """
        try:
            database_name = self.data_ingestion_config.database_name
            collection_name = self.data_ingestion_config.collection_name
            logging.info(f"Connecting to MongoDB: {database_name}.{collection_name}")

            client = pymongo.MongoClient(uri)
            try:
                collection = client[database_name][collection_name]
                df = pd.DataFrame(list(collection.find({})))
            finally:
                # BUGFIX: the client was never closed before, leaking one
                # connection per call.
                client.close()

            if "_id" in df.columns:
                df.drop(columns=["_id"], axis=1, inplace=True)

            df.replace("", np.nan, inplace=True)
            logging.info(f"DataFrame shape: {df.shape}")
            return df
        except Exception as e:
            raise NetworkSecurityException(e, sys)

    def split_data_as_train_test(self, df: pd.DataFrame) -> None:
        """
        Split *df* into train and test sets and write both CSVs to the
        configured paths.

        Raises:
            NetworkSecurityException: on split or write failure.
        """
        try:
            train_set, test_set = train_test_split(
                df, test_size=self.data_ingestion_config.train_test_split_ratio, random_state=42
            )
            logging.info(f"Train set shape: {train_set.shape}, Test set shape: {test_set.shape}")

            # Create both target directories — train and test paths are not
            # guaranteed to share a parent.
            os.makedirs(os.path.dirname(self.data_ingestion_config.training_file_path), exist_ok=True)
            os.makedirs(os.path.dirname(self.data_ingestion_config.test_file_path), exist_ok=True)
            train_set.to_csv(self.data_ingestion_config.training_file_path, index=False, header=True)
            test_set.to_csv(self.data_ingestion_config.test_file_path, index=False, header=True)
        except Exception as e:
            raise NetworkSecurityException(e, sys)

    def initiate_data_ingestion(self) -> DataIngestionArtifact:
        """
        Run the full ingestion: export from Mongo, persist the feature-store
        CSV, split train/test, and return the artifact with all three paths.

        Raises:
            NetworkSecurityException: on any stage failure.
        """
        try:
            logging.info("Starting data ingestion")
            df = self.export_collection_as_dataframe()

            # BUGFIX: write the feature-store CSV so the path reported in the
            # artifact below actually exists — the write was dropped in a
            # refactor while the artifact still advertised the path.
            feature_store_path = self.data_ingestion_config.feature_store_file_path
            os.makedirs(os.path.dirname(feature_store_path), exist_ok=True)
            df.to_csv(feature_store_path, index=False, header=True)

            self.split_data_as_train_test(df)

            artifact = DataIngestionArtifact(
                feature_store_file_path=feature_store_path,
                train_file_path=self.data_ingestion_config.training_file_path,
                test_file_path=self.data_ingestion_config.test_file_path
            )
            logging.info("Data ingestion completed successfully")
            return artifact
        except Exception as e:
            raise NetworkSecurityException(e, sys)

0 commit comments

Comments
 (0)