1+ import sys , os
2+ import azure_chestxray_utils
3+ import pickle
4+ import random
5+ import re
6+ import tqdm
7+ import cv2
8+ import numpy as np
9+ import pandas as pd
10+ import sklearn .model_selection
11+ from collections import Counter
12+
# Make the project's Code/src directory importable without adding
# duplicate entries to sys.path.
paths_to_append = [os.path.join(os.getcwd(), 'Code', 'src')]


def add_path_to_sys_path(path_to_append):
    """Append path_to_append to sys.path unless some existing entry
    already contains it.

    NOTE(review): the membership test is a substring check, so '/a/b'
    is skipped when '/a/b/c' is already on sys.path -- presumably meant
    as a prefix/duplicate guard; confirm an equality check was not
    intended.
    """
    if not any(path_to_append in existing for existing in sys.path):
        sys.path.append(path_to_append)


# Plain loop for the side effect -- the original built a throwaway list
# of None via a list comprehension.
for crt_path in paths_to_append:
    add_path_to_sys_path(crt_path)
19+
# Shared working directory under the current working directory.
# Bug fix: the original did `amlWBSharedDir = os.mkdir(path)` when the
# directory was missing -- os.mkdir() returns None, so every later
# os.path.join(amlWBSharedDir, ...) crashed on a fresh machine. It also
# concatenated a raw Windows path (r'\azure-share'); build it portably.
path = os.path.join(os.getcwd(), 'azure-share')
os.makedirs(path, exist_ok=True)  # no-op if it already exists
amlWBSharedDir = path
26+
27+
28+
29+
# Project-wide path constants (directory name lists) from the shared
# utilities module.
prj_consts = azure_chestxray_utils.chestxray_consts()
print(prj_consts)

# os.path.join accepts the unpacked list directly; no inner join needed.
data_base_input_dir = os.path.join(amlWBSharedDir, *prj_consts.BASE_INPUT_DIR_list)
data_base_output_dir = os.path.join(amlWBSharedDir, *prj_consts.BASE_OUTPUT_DIR_list)

# Bug fix: the original assigned os.mkdir()'s None return value back to
# the path variables, clobbering them whenever the directory had to be
# created. Create the directory and keep the path string.
if not os.path.exists(data_base_input_dir):
    os.makedirs(data_base_input_dir)
    print(data_base_input_dir)

if not os.path.exists(data_base_output_dir):
    os.makedirs(data_base_output_dir)
    print(data_base_output_dir)
46+
# Directory holding the NIH chest x-ray image files.
nih_chest_xray_data_dir = os.path.join(
    data_base_input_dir, *prj_consts.ChestXray_IMAGES_DIR_list)
if not os.path.exists(nih_chest_xray_data_dir):
    # Bug fix: do not assign os.mkdir()'s None return value back to the
    # path variable (the original wiped nih_chest_xray_data_dir here).
    os.makedirs(nih_chest_xray_data_dir)

print(nih_chest_xray_data_dir)

# CSV metadata lives alongside the images; partitions go to the output tree.
other_data_dir = os.path.join(
    data_base_input_dir, *prj_consts.ChestXray_OTHER_DATA_DIR_list)
data_partitions_dir = os.path.join(
    data_base_output_dir, *prj_consts.DATA_PARTITIONS_DIR_list)
57+
# Images excluded from every partition (empty by default; may be filled
# from the blacklist file).
ignored_images_set = set()

# Number of distinct patients in the NIH ChestX-ray14 release.
total_patient_number = 30805
NIH_annotated_file = 'BBox_List_2017.csv'  # exclude from train: pathology annotated by radiologists
manually_selected_bad_images_file = 'blacklist.csv'  # exclude what visually looks like bad images

# Patient IDs are 1-based: 1 .. total_patient_number inclusive.
# list(range(...)) instead of the hand-rolled comprehension.
patient_id_original = list(range(1, total_patient_number + 1))
65+
# Patients with radiologist bounding-box annotations are reserved for the
# test set. 'Image Index' values look like '00000013_005.png'; characters
# 3:8 are the zero-padded patient id.
bbox_df = pd.read_csv(os.path.join(other_data_dir, NIH_annotated_file))
bbox_patient_index_df = bbox_df['Image Index'].str.slice(3, 8)

# Fix: Series.iteritems() was deprecated in pandas 1.5 and removed in
# pandas 2.0; convert vectorized instead of an append loop.
bbox_patient_index_list = bbox_patient_index_df.astype(int).tolist()

# Remove annotated patients from the general pool.
patient_id = list(set(patient_id_original) - set(bbox_patient_index_list))
print("len of original patient id is", len(patient_id_original))
print("len of cleaned patient id is", len(patient_id))
print("len of unique patient id with annotated data",
      len(list(set(bbox_patient_index_list))))
print("len of patient id with annotated data", bbox_df.shape[0])
79+
# Fixed seed so the partition is reproducible across runs.
random.seed(0)
random.shuffle(patient_id)

print("first ten patient ids are", patient_id[:10])

# Split patients 70% / 10% / 20% into train / valid / test.
seventy_pct = int(total_patient_number * 0.7)
eighty_pct = int(total_patient_number * 0.8)

patient_id_train = patient_id[:seventy_pct]
patient_id_valid = patient_id[seventy_pct:eighty_pct]
# The remaining patients, plus every bbox-annotated patient, form the
# test set (deduplicated).
patient_id_test = list(set(patient_id[eighty_pct:] + bbox_patient_index_list))

print("train:{} valid:{} test:{}".format(
    len(patient_id_train), len(patient_id_valid), len(patient_id_test)))
94+
# Disease names define the order of the multi-hot label vector.
pathologies_name_list = prj_consts.DISEASE_list
NIH_patients_and_labels_file = 'Data_Entry_2017.csv'

labels_df = pd.read_csv(os.path.join(other_data_dir, NIH_patients_and_labels_file))


# Show the label distribution.

# Counter tallies the multi-label 'Finding Labels' strings (e.g.
# 'Cardiomegaly|Effusion'); it consumes the Series directly, no
# intermediate list needed.
pathology_distribution = Counter(labels_df['Finding Labels'])

# most_common() is Counter's built-in sort-by-frequency (descending),
# replacing the manual sorted(..., key=lambda x: x[1], reverse=True).
sorted_by_freq = pathology_distribution.most_common()
print(len(sorted_by_freq))
print(sorted_by_freq[:20])
print(sorted_by_freq[-10:])

# Per-disease totals: get_dummies splits on '|' itself, so the original
# split('|') / join('*') round-trip was unnecessary.
print(labels_df['Finding Labels'].str.get_dummies(sep='|').sum())
115+
def process_data(current_df, patient_ids):
    """Collect image names and multi-hot label vectors for a set of patients.

    Args:
        current_df: DataFrame with at least 'Patient ID', 'Image Index'
            and 'Finding Labels' columns (NIH Data_Entry_2017.csv schema).
        patient_ids: iterable of patient ids to include.

    Returns:
        (image_name_index, image_labels): the list of image file names for
        these patients (minus ignored_images_set) and a dict mapping each
        name to a uint8 vector with one slot per pathologies_name_list entry.
    """
    # Consistency fix: vector length follows pathologies_name_list instead
    # of a hard-coded 14, so the label order and size cannot drift apart.
    num_diseases = len(pathologies_name_list)
    image_name_index = []
    image_labels = {}
    for individual_patient in tqdm.tqdm(patient_ids):
        for _, row in current_df[current_df['Patient ID'] == individual_patient].iterrows():
            processed_image_name = row['Image Index']
            # Skip images flagged as bad / excluded.
            if processed_image_name in ignored_images_set:
                continue
            image_name_index.append(processed_image_name)
            # np.zeros already initializes every slot to 0, so only the
            # matching diseases need to be set (the original's else-branch
            # writing 0 was redundant).
            labels = np.zeros(num_diseases, dtype=np.uint8)
            for disease_index, ele in enumerate(pathologies_name_list):
                if re.search(ele, row['Finding Labels'], re.IGNORECASE):
                    labels[disease_index] = 1
            image_labels[processed_image_name] = labels
    return image_name_index, image_labels
135+
136+
# Build the three partitions from the patient-id splits.
train_data_index, train_labels = process_data(labels_df, patient_id_train)
valid_data_index, valid_labels = process_data(labels_df, patient_id_valid)
test_data_index, test_labels = process_data(labels_df, patient_id_test)

print("train, valid, test image number is:",
      len(train_data_index), len(valid_data_index), len(test_data_index))

# Persist labels and partitions for downstream training code.
labels_all = {}
for split_labels in (train_labels, valid_labels, test_labels):
    labels_all.update(split_labels)

partition_dict = {'train': train_data_index,
                  'test': test_data_index,
                  'valid': valid_data_index}

with open(os.path.join(data_partitions_dir, 'labels14_unormalized_cleaned.pickle'), 'wb') as f:
    pickle.dump(labels_all, f)

with open(os.path.join(data_partitions_dir, 'partition14_unormalized_cleaned.pickle'), 'wb') as f:
    pickle.dump(partition_dict, f)

# Also save the patient id partitions for pytorch training.
with open(os.path.join(data_partitions_dir, 'train_test_valid_data_partitions.pickle'), 'wb') as f:
    pickle.dump([patient_id_train,
                 patient_id_valid,
                 patient_id_test,
                 list(set(bbox_patient_index_list))], f)

# Quick sanity peek at a few training labels.
print(type(train_labels))
print({k: train_labels[k] for k in list(train_labels)[:5]})