1- from networksecurity .exception .exception import NetworkSecurityException
2- from networksecurity .logging .logger import logging
3- from networksecurity .entity .config_entity import DataIngestionConfig
4- from networksecurity .entity .artifact_entity import DataIngestionArtifact
51import os
62import sys
73import numpy as np
84import pandas as pd
95import pymongo
10- from typing import List
116from sklearn .model_selection import train_test_split
127from dotenv import load_dotenv
8+ from networksecurity .exception .exception import NetworkSecurityException
9+ from networksecurity .logging .logger import logging
10+ from networksecurity .entity .config_entity import DataIngestionConfig
11+ from networksecurity .entity .artifact_entity import DataIngestionArtifact
1312
# Populate the process environment from a local .env file (if one exists),
# then read the MongoDB connection string. `uri` is None when the variable
# MONGODB_URI is not set.
load_dotenv()
uri = os.environ.get('MONGODB_URI')
1615
class DataIngestion:
    """Read a MongoDB collection and persist it as CSV artifacts.

    ``initiate_data_ingestion`` runs the whole stage:

    1. load the configured collection into a DataFrame,
    2. write that DataFrame to the feature-store CSV,
    3. split it and write the train and test CSV files.

    Database/collection names, the split ratio and every output path are read
    from the ``DataIngestionConfig`` given to the constructor. Any failure is
    re-raised as ``NetworkSecurityException``.
    """

    def __init__(self, data_ingestion_config: DataIngestionConfig):
        """Keep a reference to the configuration; performs no I/O."""
        self.data_ingestion_config = data_ingestion_config

    @staticmethod
    def _ensure_parent_dir(file_path: str) -> None:
        """Create the directory that will hold *file_path*, if it has one."""
        dir_path = os.path.dirname(file_path)
        # A bare file name has an empty directory part, and os.makedirs("")
        # raises FileNotFoundError, so only create real directories.
        if dir_path:
            os.makedirs(dir_path, exist_ok=True)

    def export_collection_as_dataframe(self) -> pd.DataFrame:
        """Export the configured MongoDB collection to a DataFrame.

        The MongoDB ``_id`` column is dropped when present and empty strings
        are replaced with NaN.

        Returns:
            The collection's documents as a DataFrame.

        Raises:
            NetworkSecurityException: if connecting, querying or converting
                the data fails.
        """
        try:
            database_name = self.data_ingestion_config.database_name
            collection_name = self.data_ingestion_config.collection_name
            logging.info(f"Connecting to MongoDB: {database_name}.{collection_name}")

            # The context manager closes the client's connections even when
            # the query raises, so no sockets are leaked per call.
            with pymongo.MongoClient(uri) as client:
                collection = client[database_name][collection_name]
                # Materialize the cursor before the client is closed.
                records = list(collection.find({}))

            df = pd.DataFrame(records)
            if "_id" in df.columns:
                df = df.drop(columns=["_id"])

            df.replace("", np.nan, inplace=True)
            logging.info(f"DataFrame shape: {df.shape}")
            return df
        except Exception as e:
            raise NetworkSecurityException(e, sys) from e

    def export_data_into_feature_store(
        self, df: pd.DataFrame, feature_store_file_path: str
    ) -> pd.DataFrame:
        """Write *df* to the feature-store CSV and return it unchanged.

        Args:
            df: Data to persist.
            feature_store_file_path: Destination CSV path; its parent
                directory is created when missing.

        Returns:
            The same DataFrame that was passed in.

        Raises:
            NetworkSecurityException: if the directory or file cannot be
                written.
        """
        try:
            self._ensure_parent_dir(feature_store_file_path)
            df.to_csv(feature_store_file_path, index=False, header=True)
            return df
        except Exception as e:
            raise NetworkSecurityException(e, sys) from e

    def split_data_as_train_test(self, df: pd.DataFrame) -> None:
        """Split *df* into train and test sets and write both as CSV.

        The test fraction comes from ``train_test_split_ratio`` in the
        config; ``random_state=42`` keeps the split reproducible.

        Raises:
            NetworkSecurityException: if splitting or writing fails.
        """
        try:
            config = self.data_ingestion_config
            train_set, test_set = train_test_split(
                df, test_size=config.train_test_split_ratio, random_state=42
            )
            logging.info(f"Train set shape: {train_set.shape}, Test set shape: {test_set.shape}")

            # The two files may live in different directories, so make sure
            # each destination exists rather than only the training one.
            self._ensure_parent_dir(config.training_file_path)
            self._ensure_parent_dir(config.test_file_path)
            train_set.to_csv(config.training_file_path, index=False, header=True)
            test_set.to_csv(config.test_file_path, index=False, header=True)
        except Exception as e:
            raise NetworkSecurityException(e, sys) from e

    def initiate_data_ingestion(self) -> DataIngestionArtifact:
        """Run the full ingestion stage and describe its outputs.

        Returns:
            A ``DataIngestionArtifact`` carrying the feature-store, train and
            test file paths taken from the config.

        Raises:
            NetworkSecurityException: if any step fails.
        """
        try:
            logging.info("Starting data ingestion")
            config = self.data_ingestion_config
            df = self.export_collection_as_dataframe()
            # Persist the raw export so the feature_store_file_path reported
            # in the artifact below points at a file that actually exists.
            df = self.export_data_into_feature_store(df, config.feature_store_file_path)
            self.split_data_as_train_test(df)

            artifact = DataIngestionArtifact(
                feature_store_file_path=config.feature_store_file_path,
                train_file_path=config.training_file_path,
                test_file_path=config.test_file_path,
            )
            logging.info("Data ingestion completed successfully")
            return artifact
        except NetworkSecurityException:
            # Already wrapped by one of the steps above; wrapping it again
            # would only nest the error message.
            raise
        except Exception as e:
            raise NetworkSecurityException(e, sys) from e
0 commit comments