Skip to content

Commit 745b91b

Browse files
data validation completed
1 parent f3abefa commit 745b91b

18 files changed

Lines changed: 331 additions & 6 deletions

File tree

=4.0

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
Collecting pymongo
2+
Using cached pymongo-4.10.1-cp310-cp310-macosx_10_9_x86_64.whl.metadata (22 kB)
3+
Requirement already satisfied: dnspython<3.0.0,>=1.16.0 in ./netsec_env/lib/python3.10/site-packages (from pymongo) (1.16.0)
4+
Using cached pymongo-4.10.1-cp310-cp310-macosx_10_9_x86_64.whl (835 kB)
5+
Installing collected packages: pymongo
6+
Successfully installed pymongo-4.10.1

data_schema/schema.yaml

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
columns:
  - having_IP_Address: int64
  - URL_Length: int64
  - Shortining_Service: int64
  - having_At_Symbol: int64
  - double_slash_redirecting: int64
  - Prefix_Suffix: int64
  - having_Sub_Domain: int64
  - SSLfinal_State: int64
  - Domain_registeration_length: int64
  - Favicon: int64
  - port: int64
  - HTTPS_token: int64
  - Request_URL: int64
  - URL_of_Anchor: int64
  - Links_in_tags: int64
  - SFH: int64
  - Submitting_to_email: int64
  - Abnormal_URL: int64
  - Redirect: int64
  - on_mouseover: int64
  - RightClick: int64
  - popUpWidnow: int64
  - Iframe: int64
  - age_of_domain: int64
  - DNSRecord: int64
  - web_traffic: int64
  - Page_Rank: int64
  - Google_Index: int64
  - Links_pointing_to_page: int64
  - Statistical_report: int64
  - Result: int64

numerical_columns:
  - having_IP_Address
  - URL_Length
  - Shortining_Service
  - having_At_Symbol
  - double_slash_redirecting
  - Prefix_Suffix
  - having_Sub_Domain
  - SSLfinal_State
  - Domain_registeration_length
  - Favicon
  - port
  - HTTPS_token
  - Request_URL
  - URL_of_Anchor
  - Links_in_tags
  - SFH
  - Submitting_to_email
  - Abnormal_URL
  - Redirect
  - on_mouseover
  - RightClick
  - popUpWidnow
  - Iframe
  - age_of_domain
  - DNSRecord
  - web_traffic
  - Page_Rank
  - Google_Index
  - Links_pointing_to_page
  - Statistical_report
  - Result

main.py

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
11
from networksecurity.components.data_ingestion import DataIngestion
2+
from networksecurity.components.data_validation import DataValidation
23
from networksecurity.exception.exception import NetworkSecurityException
34
from networksecurity.logging.logger import logging
4-
from networksecurity.entity.config_entity import DataIngestionConfig
5+
from networksecurity.entity.config_entity import DataIngestionConfig, DataValidationConfig
56
from networksecurity.entity.config_entity import Training_pipeline_config
67
import sys
78

@@ -12,6 +13,15 @@
1213
data_ingestion=DataIngestion(data_ingestion_config=dataingestionconfig)
1314
logging.info("starting data ingestion")
1415
dataingestionartifact=data_ingestion.initiate_data_ingestion()
16+
logging.info("data ingestion completed")
1517
print(dataingestionartifact)
18+
data_validation_config=DataValidationConfig(training_pipeline_config)
19+
data_validation=DataValidation(dataingestionartifact,data_validation_config)
20+
logging.info("starting data validation")
21+
data_validation_artifact=data_validation.initiate_data_validation()
22+
logging.info("data validation completed")
23+
print(data_validation_artifact)
24+
25+
1626
except Exception as e:
1727
raise NetworkSecurityException(e, sys)
Binary file not shown.
Lines changed: 113 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,113 @@
1+
from networksecurity.entity.artifact_entity import DataIngestionArtifact, DataValidationArtifact
2+
from networksecurity.entity.config_entity import DataValidationConfig
3+
from networksecurity.exception.exception import NetworkSecurityException
4+
from networksecurity.logging.logger import logging
5+
from networksecurity.constants import SCHEMA_FILE_PATH
6+
from scipy.stats import ks_2samp
7+
import os
8+
import pandas as pd
9+
import sys
10+
from networksecurity.utils.main_utils import read_yaml_file, write_yaml_file
11+
12+
class DataValidation:
    """Validate ingested train/test data against the dataset schema and check for drift.

    Reads the train/test CSVs produced by data ingestion, verifies each
    dataframe has the number of columns declared in the schema YAML, runs a
    per-column two-sample Kolmogorov-Smirnov drift test between train and
    test, writes a drift report, and re-exports the validated CSVs.
    """

    def __init__(self, data_ingestion_artifact: DataIngestionArtifact,
                 data_validation_config: DataValidationConfig):
        """Store the ingestion artifact / validation config and load the schema.

        Raises:
            NetworkSecurityException: if the schema file cannot be read.
        """
        try:
            self.data_ingestion_artifact = data_ingestion_artifact
            self.data_validation_config = data_validation_config
            # The schema drives the expected-column-count check below.
            self.schema_config = read_yaml_file(file_path=SCHEMA_FILE_PATH)
        except Exception as e:
            raise NetworkSecurityException(e, sys)

    @staticmethod
    def read_data(file_path) -> pd.DataFrame:
        """Load a CSV file into a pandas DataFrame.

        Raises:
            NetworkSecurityException: if the file cannot be read/parsed.
        """
        try:
            return pd.read_csv(file_path)
        except Exception as e:
            raise NetworkSecurityException(e, sys)

    def validate_number_of_columns(self, dataframe: pd.DataFrame) -> bool:
        """Return True when `dataframe` has exactly as many columns as the schema declares.

        Note: this only compares counts, not column names — a renamed column
        with the same total count still passes.
        """
        try:
            number_of_columns = len(self.schema_config["columns"])
            logging.info(f"Number of columns in dataframe: {len(dataframe.columns)}")
            logging.info(f"Required number of columns as per schema: {number_of_columns}")
            return len(dataframe.columns) == number_of_columns
        except Exception as e:
            raise NetworkSecurityException(e, sys)

    def detect_drift_in_data(self, base_df, current_df, threshold=0.5) -> bool:
        """KS-test each column of `base_df` against `current_df`.

        A column is flagged as drifted when its KS p-value is below
        `threshold`. Writes a per-column report (p_value, drift_status) to the
        configured drift-report path and returns True when NO drift was found.

        NOTE(review): 0.5 is an unusually strict p-value cutoff; the
        conventional significance level is 0.05 — confirm the intent before
        relying on this default.
        """
        try:
            status = True
            report = {}
            for column in base_df.columns:
                base_sample = base_df[column]
                current_sample = current_df[column]
                ks_result = ks_2samp(base_sample, current_sample)
                # Drift when the two samples are unlikely to share a distribution.
                drift_found = ks_result.pvalue < threshold
                if drift_found:
                    status = False
                report[column] = {
                    "p_value": float(ks_result.pvalue),
                    "drift_status": drift_found,
                }

            # BUG FIX: the configured drift_report_file_path is already the full
            # report file path (…/drift_report/report.yaml per the pipeline
            # constants); the old code joined an extra "drift_report.json"
            # segment, so the artifact's recorded path was never actually
            # written — and a ".json" name was written with YAML content.
            drift_report_file_path = self.data_validation_config.drift_report_file_path
            os.makedirs(os.path.dirname(drift_report_file_path), exist_ok=True)
            write_yaml_file(file_path=drift_report_file_path, content=report, replace=True)
            return status
        except Exception as e:
            raise NetworkSecurityException(e, sys)

    def initiate_data_validation(self) -> DataValidationArtifact:
        """Run every validation step and return a DataValidationArtifact.

        The artifact's `validation_status` is True only when BOTH dataframes
        pass the column-count check AND no drift is detected.

        Raises:
            NetworkSecurityException: wraps any failure in the pipeline step.
        """
        try:
            train_file_path = self.data_ingestion_artifact.train_file_path
            test_file_path = self.data_ingestion_artifact.test_file_path

            # Read data from the ingestion train/test file locations.
            train_df = DataValidation.read_data(train_file_path)
            test_df = DataValidation.read_data(test_file_path)

            # Validate the number of columns in each dataframe.
            error_message = ""
            train_columns_ok = self.validate_number_of_columns(dataframe=train_df)
            if not train_columns_ok:
                error_message = f"{error_message}Train dataframe does not contain all columns."
            test_columns_ok = self.validate_number_of_columns(dataframe=test_df)
            if not test_columns_ok:
                error_message = f"{error_message}Test dataframe does not contain all columns."
            if error_message:
                # BUG FIX: the old code logged the accumulated message plus the
                # same sentence appended again, duplicating text in the log.
                logging.info(error_message)

            # Check data drift between train (reference) and test (current).
            drift_ok = self.detect_drift_in_data(base_df=train_df, current_df=test_df)

            # BUG FIX: a single `status` variable used to be reassigned by each
            # check, so the artifact only reflected the drift result and the
            # column-count failures were silently discarded.
            validation_status = train_columns_ok and test_columns_ok and drift_ok

            # Persist the validated copies of the data.
            dir_path = os.path.dirname(self.data_validation_config.valid_train_file_path)
            os.makedirs(dir_path, exist_ok=True)
            train_df.to_csv(self.data_validation_config.valid_train_file_path, index=False, header=True)
            test_df.to_csv(self.data_validation_config.valid_test_file_path, index=False, header=True)

            return DataValidationArtifact(
                validation_status=validation_status,
                valid_train_file_path=self.data_validation_config.valid_train_file_path,
                valid_test_file_path=self.data_validation_config.valid_test_file_path,
                invalid_train_file_path=self.data_validation_config.invalid_train_file_path,
                invalid_test_file_path=self.data_validation_config.invalid_test_file_path,
                drift_report_file_path=self.data_validation_config.drift_report_file_path,
            )
        except Exception as e:
            raise NetworkSecurityException(e, sys)

networksecurity/constants/Training_pipeline/__init__.py

Lines changed: 23 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
# networksecurity/constants/Training_pipeline/__init__.py
2+
13
import os
24
import sys
35
import numpy as np
@@ -14,12 +16,31 @@
1416
# Shared file names used by every pipeline stage.
TRAIN_FILE_NAME: str = "train.csv"
TEST_FILE_NAME: str = "test.csv"

# Path to the dataset schema consumed by the data-validation component.
SCHEMA_FILE_PATH = os.path.join("data_schema", "schema.yaml")

"""
Data Ingestion related constant start with DATA_INGESTION VAR NAME
"""
# MongoDB source for raw data.
DATA_INGESTION_COLLECTION_NAME: str = "network_data"
DATA_INGESTION_DATABASE_NAME: str = "AUSTINAI"
# Directory layout of the ingestion artifacts.
DATA_INGESTION_DIR_NAME: str = "data_ingestion"
DATA_INGESTION_FEATURE_STORE_DIR: str = "feature_store"
DATA_INGESTION_INGESTED_DIR: str = "ingested"
# Fraction of rows held out for the test split.
DATA_INGESTION_TRAIN_TEST_SPLIT_RATIO: float = 0.2

"""
Data Validation related constant start with DATA_VALIDATION VAR NAME
"""
# Directory layout of the validation artifacts.
DATA_VALIDATION_DIR_NAME: str = "data_validation"
DATA_VALIDATION_VALID_DIR: str = "validated"
DATA_VALIDATION_INVALID_DIR: str = "invalid"
# Drift report location (directory + file name).
DATA_VALIDATION_DRIFT_REPORT_DIR: str = "drift_report"
DATA_VALIDATION_DRIFT_REPORT_FILE_NAME: str = "report.yaml"

# File names for the validated train/test exports.
DATA_VALIDATION_TRAIN_FILE_NAME: str = "train.csv"
DATA_VALIDATION_TEST_FILE_NAME: str = "test.csv"

"""
Data Transformation related constant start with DATA_TRANSFORMATION VAR NAME
"""
Binary file not shown.
Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
# networksecurity/constants/__init__.py
#
# Re-export the Training_pipeline constants at package level so callers can
# write `from networksecurity.constants import SCHEMA_FILE_PATH` directly.
from networksecurity.constants.Training_pipeline import (
    TARGET_COLUMN,
    PIPELINE_NAME,
    ARTIFACT_DIR,
    FILE_NAME,
    TRAIN_FILE_NAME,
    TEST_FILE_NAME,
    SCHEMA_FILE_PATH,
    # Data ingestion constants.
    DATA_INGESTION_COLLECTION_NAME,
    DATA_INGESTION_DATABASE_NAME,
    DATA_INGESTION_DIR_NAME,
    DATA_INGESTION_FEATURE_STORE_DIR,
    DATA_INGESTION_INGESTED_DIR,
    DATA_INGESTION_TRAIN_TEST_SPLIT_RATIO,
    # Data validation constants.
    DATA_VALIDATION_DIR_NAME,
    DATA_VALIDATION_VALID_DIR,
    DATA_VALIDATION_INVALID_DIR,
    DATA_VALIDATION_DRIFT_REPORT_DIR,
    DATA_VALIDATION_DRIFT_REPORT_FILE_NAME,
    DATA_VALIDATION_TRAIN_FILE_NAME,
    DATA_VALIDATION_TEST_FILE_NAME,
)

# Explicit public API of this package: exactly the names re-exported above.
__all__ = [
    "TARGET_COLUMN",
    "PIPELINE_NAME",
    "ARTIFACT_DIR",
    "FILE_NAME",
    "TRAIN_FILE_NAME",
    "TEST_FILE_NAME",
    "SCHEMA_FILE_PATH",
    "DATA_INGESTION_COLLECTION_NAME",
    "DATA_INGESTION_DATABASE_NAME",
    "DATA_INGESTION_DIR_NAME",
    "DATA_INGESTION_FEATURE_STORE_DIR",
    "DATA_INGESTION_INGESTED_DIR",
    "DATA_INGESTION_TRAIN_TEST_SPLIT_RATIO",
    "DATA_VALIDATION_DIR_NAME",
    "DATA_VALIDATION_VALID_DIR",
    "DATA_VALIDATION_INVALID_DIR",
    "DATA_VALIDATION_DRIFT_REPORT_DIR",
    "DATA_VALIDATION_DRIFT_REPORT_FILE_NAME",
    "DATA_VALIDATION_TRAIN_FILE_NAME",
    "DATA_VALIDATION_TEST_FILE_NAME",
]
772 Bytes
Binary file not shown.
369 Bytes
Binary file not shown.

0 commit comments

Comments
 (0)