|
1 | | -import argparse |
2 | | -import json |
3 | | -import os |
4 | | -import time |
5 | | - |
6 | | -import pandas as pd |
7 | | - |
8 | | - |
def parse_args() -> argparse.Namespace:
    """Build and parse the command-line interface for the DDoS CSV filter.

    Returns:
        argparse.Namespace with ``input`` (list of CSV paths), ``output``,
        ``ip_map_path``, and ``chunksize`` attributes.
    """
    parser = argparse.ArgumentParser(
        description="Filter large CSV by DDOS IPs using chunked processing"
    )
    # Positional: one or more input CSVs, processed in order.
    parser.add_argument("input", nargs="+",
                        help="Path(s) to input CSV file(s) - can specify multiple files")
    parser.add_argument("--output", default="decoded_pcap_data_set1_full_ddos.csv",
                        help="Path to output CSV file")
    # Stored as ip_map_path to keep the attribute a valid identifier.
    parser.add_argument("--ip-map", dest="ip_map_path", default="full_ip_map.json",
                        help="Path to JSON mapping of IP string to integer id")
    parser.add_argument("--chunksize", type=int, default=1_000_000,
                        help="Number of rows per chunk when reading the input CSV")
    return parser.parse_args()
34 | | - |
35 | | - |
# DDoS IPs extracted from the provided image; used as the default filter set
# because the CLI does not currently accept an explicit IP list.
_DEFAULT_DDOS_IPS = [
    "172.28.3.121",
    "172.28.213.189",
    "172.28.214.102",
    "172.28.23.163",
    "172.28.197.120",
    "77.91.104.22",
    "201.89.32.16",
    "68.91.226.37",
    "172.28.3.242",
    "172.28.16.39",
    "172.28.3.56",
    "172.28.13.198",
    "172.28.133.17",
    "172.28.220.39",
    "172.28.12.167",
    "172.28.13.29",
    "172.28.6.9",
    "172.28.27.162",
    "172.28.197.210",
    "172.28.196.167",
    "172.28.212.207",
    "172.28.218.87",
    "172.28.5.81",
    "172.28.27.31",
    "172.28.11.182",
    "172.28.218.214",
    "172.28.133.166",
    "172.28.209.155",
    "172.28.219.150",
    "172.28.12.45",
    "172.28.222.131",
    "172.28.132.105",
    "172.28.198.83",
    "172.28.6.47",
    "172.28.211.200",
    "172.28.14.161",
    "172.28.128.124",
    "172.28.212.131",
    "172.28.211.212",
    "172.28.11.150",
    "172.28.130.107",
    "172.28.131.186",
    "172.28.212.85",
    "172.28.2.148",
    "172.28.194.173",
    "172.28.214.23",
    "172.28.16.212",
    "172.28.15.104",
    "172.28.128.218",
    "70.98.1.1",
    "66.200.1.1",
    "24.145.1.1",
    "64.180.1.1",
    "172.28.3.248",
    "172.28.8.210",
    "172.28.132.25",
    "172.28.22.122",
    "172.28.198.147",
    "172.28.2.90",
    "172.28.3.39",
    "172.28.6.48",
    "172.28.130.83",
    "172.28.6.170",
    "44.29.203.5",
    "123.44.92.173",
    "64.222.102.58",
]


def _load_ddos_ids(ip_map_path: str, ips: list) -> set:
    """Translate IP strings to integer ids via the JSON ip-map file.

    IPs absent from the mapping are silently dropped (preserves the
    original behavior of filtering with ``if ip in ip_map``).
    """
    # Explicit encoding: the platform default is not guaranteed to be UTF-8.
    with open(ip_map_path, "r", encoding="utf-8") as f:
        ip_map = json.load(f)
    return {ip_map[ip] for ip in ips if ip in ip_map}


def _filter_file(input_file: str, output_path: str, ddos_ids: set, chunksize: int) -> tuple:
    """Stream one CSV in chunks, appending rows whose src/dst id is in ddos_ids.

    Returns:
        (rows_processed, rows_matched) for this file.
    """
    file_start_time = time.time()
    file_total_rows = 0
    file_total_matched = 0
    chunk_index = 0

    # Stream input CSV in chunks and write matching rows incrementally
    for chunk in pd.read_csv(input_file, chunksize=chunksize):
        chunk_index += 1
        file_total_rows += len(chunk)

        # If required columns are missing, skip this chunk (rows still counted).
        if "src_ip" not in chunk.columns or "dst_ip" not in chunk.columns:
            continue

        # src_ip and dst_ip might be floats or strings; coerce to nullable
        # Int64 before matching so unparseable rows become <NA> and fail isin.
        src_ids = pd.to_numeric(chunk["src_ip"], errors="coerce").astype("Int64")
        dst_ids = pd.to_numeric(chunk["dst_ip"], errors="coerce").astype("Int64")
        mask = src_ids.isin(ddos_ids) | dst_ids.isin(ddos_ids)
        filtered_chunk = chunk[mask]
        matched_count = len(filtered_chunk)
        file_total_matched += matched_count

        if not filtered_chunk.empty:
            # Write header only on first write (file doesn't exist because
            # main() removed it before processing began).
            write_header = not os.path.exists(output_path)
            filtered_chunk.to_csv(output_path, index=False, mode="a", header=write_header)

        print(
            f"  Chunk {chunk_index}: processed {len(chunk):,} rows (file total {file_total_rows:,}); matched {matched_count:,} (file total {file_total_matched:,})"
        )

    file_elapsed = time.time() - file_start_time
    print(f"  File complete: {file_total_rows:,} rows processed, {file_total_matched:,} matched in {file_elapsed:.2f}s")
    return file_total_rows, file_total_matched


def main() -> None:
    """Filter each input CSV down to rows touching the known DDoS IP set.

    Reads every input file in chunks of ``--chunksize`` rows, keeps rows
    whose ``src_ip`` or ``dst_ip`` integer id belongs to the DDoS set
    (per the ``--ip-map`` JSON), and appends matches to ``--output``.
    """
    args = parse_args()

    # Convert IPs to integers using the mapping and drop missing
    ddos_ids = _load_ddos_ids(args.ip_map_path, _DEFAULT_DDOS_IPS)

    # Ensure we start fresh so the append/header logic below is predictable.
    if os.path.exists(args.output):
        os.remove(args.output)

    print(f"Processing {len(args.input)} input file(s)...")
    print(f"DDoS IPs to filter: {len(ddos_ids)} unique IPs")

    overall_start_time = time.time()
    overall_total_rows = 0
    overall_total_matched = 0

    # Process each input file
    for file_index, input_file in enumerate(args.input, start=1):
        print(f"\n[{file_index}/{len(args.input)}] Reading '{input_file}' in chunks of {args.chunksize:,} rows...")

        if not os.path.exists(input_file):
            print(f"  WARNING: File not found, skipping: {input_file}")
            continue

        rows, matched = _filter_file(input_file, args.output, ddos_ids, args.chunksize)
        overall_total_rows += rows
        overall_total_matched += matched

    overall_elapsed = time.time() - overall_start_time
    print(f"\n{'='*80}")
    # The output file exists only if at least one chunk matched.
    if os.path.exists(args.output):
        print(
            f"All files processed. Wrote {overall_total_matched:,} matched rows to '{args.output}' in {overall_elapsed:.2f}s."
        )
        print(f"Total rows processed: {overall_total_rows:,}")
    else:
        print(
            f"All files processed. No matches found after processing {overall_total_rows:,} rows in {overall_elapsed:.2f}s."
        )
182 | | - |
183 | | - |
# Entry-point guard: run the filter only when executed as a script,
# not when this module is imported.
if __name__ == "__main__":
    main()
| 1 | +import argparse |
| 2 | +import json |
| 3 | +import os |
| 4 | +import time |
| 5 | + |
| 6 | +import pandas as pd |
| 7 | + |
| 8 | + |
def parse_args() -> argparse.Namespace:
    """Build and parse the command-line interface for the DDoS CSV filter.

    Returns:
        argparse.Namespace with ``input`` (list of CSV paths), ``output``,
        ``ip_map_path``, and ``chunksize`` attributes.
    """
    parser = argparse.ArgumentParser(
        description="Filter large CSV by DDOS IPs using chunked processing"
    )
    # Positional: one or more input CSVs, processed in order.
    parser.add_argument("input", nargs="+",
                        help="Path(s) to input CSV file(s) - can specify multiple files")
    parser.add_argument("--output", default="decoded_pcap_data_set1_full_ddos.csv",
                        help="Path to output CSV file")
    # Stored as ip_map_path to keep the attribute a valid identifier.
    parser.add_argument("--ip-map", dest="ip_map_path", default="full_ip_map.json",
                        help="Path to JSON mapping of IP string to integer id")
    parser.add_argument("--chunksize", type=int, default=1_000_000,
                        help="Number of rows per chunk when reading the input CSV")
    return parser.parse_args()
| 34 | + |
| 35 | + |
# DDoS IPs extracted from the provided image; used as the default filter set
# because the CLI does not currently accept an explicit IP list.
_DEFAULT_DDOS_IPS = [
    "172.28.3.121",
    "172.28.213.189",
    "172.28.214.102",
    "172.28.23.163",
    "172.28.197.120",
    "77.91.104.22",
    "201.89.32.16",
    "68.91.226.37",
    "172.28.3.242",
    "172.28.16.39",
    "172.28.3.56",
    "172.28.13.198",
    "172.28.133.17",
    "172.28.220.39",
    "172.28.12.167",
    "172.28.13.29",
    "172.28.6.9",
    "172.28.27.162",
    "172.28.197.210",
    "172.28.196.167",
    "172.28.212.207",
    "172.28.218.87",
    "172.28.5.81",
    "172.28.27.31",
    "172.28.11.182",
    "172.28.218.214",
    "172.28.133.166",
    "172.28.209.155",
    "172.28.219.150",
    "172.28.12.45",
    "172.28.222.131",
    "172.28.132.105",
    "172.28.198.83",
    "172.28.6.47",
    "172.28.211.200",
    "172.28.14.161",
    "172.28.128.124",
    "172.28.212.131",
    "172.28.211.212",
    "172.28.11.150",
    "172.28.130.107",
    "172.28.131.186",
    "172.28.212.85",
    "172.28.2.148",
    "172.28.194.173",
    "172.28.214.23",
    "172.28.16.212",
    "172.28.15.104",
    "172.28.128.218",
    "70.98.1.1",
    "66.200.1.1",
    "24.145.1.1",
    "64.180.1.1",
    "172.28.3.248",
    "172.28.8.210",
    "172.28.132.25",
    "172.28.22.122",
    "172.28.198.147",
    "172.28.2.90",
    "172.28.3.39",
    "172.28.6.48",
    "172.28.130.83",
    "172.28.6.170",
    "44.29.203.5",
    "123.44.92.173",
    "64.222.102.58",
]


def _load_ddos_ids(ip_map_path: str, ips: list) -> set:
    """Translate IP strings to integer ids via the JSON ip-map file.

    IPs absent from the mapping are silently dropped (preserves the
    original behavior of filtering with ``if ip in ip_map``).
    """
    # Explicit encoding: the platform default is not guaranteed to be UTF-8.
    with open(ip_map_path, "r", encoding="utf-8") as f:
        ip_map = json.load(f)
    return {ip_map[ip] for ip in ips if ip in ip_map}


def _filter_file(input_file: str, output_path: str, ddos_ids: set, chunksize: int) -> tuple:
    """Stream one CSV in chunks, appending rows whose src/dst id is in ddos_ids.

    Returns:
        (rows_processed, rows_matched) for this file.
    """
    file_start_time = time.time()
    file_total_rows = 0
    file_total_matched = 0
    chunk_index = 0

    # Stream input CSV in chunks and write matching rows incrementally
    for chunk in pd.read_csv(input_file, chunksize=chunksize):
        chunk_index += 1
        file_total_rows += len(chunk)

        # If required columns are missing, skip this chunk (rows still counted).
        if "src_ip" not in chunk.columns or "dst_ip" not in chunk.columns:
            continue

        # src_ip and dst_ip might be floats or strings; coerce to nullable
        # Int64 before matching so unparseable rows become <NA> and fail isin.
        src_ids = pd.to_numeric(chunk["src_ip"], errors="coerce").astype("Int64")
        dst_ids = pd.to_numeric(chunk["dst_ip"], errors="coerce").astype("Int64")
        mask = src_ids.isin(ddos_ids) | dst_ids.isin(ddos_ids)
        filtered_chunk = chunk[mask]
        matched_count = len(filtered_chunk)
        file_total_matched += matched_count

        if not filtered_chunk.empty:
            # Write header only on first write (file doesn't exist because
            # main() removed it before processing began).
            write_header = not os.path.exists(output_path)
            filtered_chunk.to_csv(output_path, index=False, mode="a", header=write_header)

        print(
            f"  Chunk {chunk_index}: processed {len(chunk):,} rows (file total {file_total_rows:,}); matched {matched_count:,} (file total {file_total_matched:,})"
        )

    file_elapsed = time.time() - file_start_time
    print(f"  File complete: {file_total_rows:,} rows processed, {file_total_matched:,} matched in {file_elapsed:.2f}s")
    return file_total_rows, file_total_matched


def main() -> None:
    """Filter each input CSV down to rows touching the known DDoS IP set.

    Reads every input file in chunks of ``--chunksize`` rows, keeps rows
    whose ``src_ip`` or ``dst_ip`` integer id belongs to the DDoS set
    (per the ``--ip-map`` JSON), and appends matches to ``--output``.
    """
    args = parse_args()

    # Convert IPs to integers using the mapping and drop missing
    ddos_ids = _load_ddos_ids(args.ip_map_path, _DEFAULT_DDOS_IPS)

    # Ensure we start fresh so the append/header logic below is predictable.
    if os.path.exists(args.output):
        os.remove(args.output)

    print(f"Processing {len(args.input)} input file(s)...")
    print(f"DDoS IPs to filter: {len(ddos_ids)} unique IPs")

    overall_start_time = time.time()
    overall_total_rows = 0
    overall_total_matched = 0

    # Process each input file
    for file_index, input_file in enumerate(args.input, start=1):
        print(f"\n[{file_index}/{len(args.input)}] Reading '{input_file}' in chunks of {args.chunksize:,} rows...")

        if not os.path.exists(input_file):
            print(f"  WARNING: File not found, skipping: {input_file}")
            continue

        rows, matched = _filter_file(input_file, args.output, ddos_ids, args.chunksize)
        overall_total_rows += rows
        overall_total_matched += matched

    overall_elapsed = time.time() - overall_start_time
    print(f"\n{'='*80}")
    # The output file exists only if at least one chunk matched.
    if os.path.exists(args.output):
        print(
            f"All files processed. Wrote {overall_total_matched:,} matched rows to '{args.output}' in {overall_elapsed:.2f}s."
        )
        print(f"Total rows processed: {overall_total_rows:,}")
    else:
        print(
            f"All files processed. No matches found after processing {overall_total_rows:,} rows in {overall_elapsed:.2f}s."
        )
| 182 | + |
| 183 | + |
# Entry-point guard: run the filter only when executed as a script,
# not when this module is imported.
if __name__ == "__main__":
    main()
0 commit comments