     NECESSARY_STRING = False
 else:
     NECESSARY_STRING = os.environ['NECESSARY_STRING']
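+# Optional: when the DOWNLOAD_FILES environment variable is 'True', the worker
+# downloads input files from S3 to local disk instead of reading the bucket mount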
+if 'DOWNLOAD_FILES' not in os.environ:
+    DOWNLOAD_FILES = False
+else:
+    DOWNLOAD_FILES = os.environ['DOWNLOAD_FILES']
+
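+# Local staging directory for downloaded inputs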
+localIn = '/home/ubuntu/local_input'
+
 
 #################################
 # CLASS TO HANDLE THE SQS QUEUE
@@ -159,8 +169,57 @@ def runCellProfiler(message):
                 logger.removeHandler(watchtowerlogger)
                 return 'SUCCESS'
         except KeyError: # Returned if that folder does not exist
-            pass
-
+            pass
+
+    csv_name = os.path.join(DATA_ROOT, message['data_file'])
+
+    # Optional: download the input files to local disk before processing
+    if DOWNLOAD_FILES:
+        if DOWNLOAD_FILES.lower() == 'true':
+            printandlog('Figuring out which files to download', logger)
+            import pandas
+            s3 = boto3.resource('s3')
+            if not os.path.exists(localIn):
+                os.mkdir(localIn)
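+            # Load the per-job CSV and cast everything to string so metadata values compare cleanly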
+            csv_in = pandas.read_csv(os.path.join(DATA_ROOT, message['data_file']))
+            csv_in = csv_in.astype('str')
+            # Figure out what metadata fields we need in this experiment, as a dict
+            if isinstance(message['Metadata'], dict):
+                filter_dict = message['Metadata']
+            else:
+                filter_dict = {}
+                for eachMetadata in message['Metadata'].split(','):
+                    filterkey, filterval = eachMetadata.split('=')
+                    filter_dict[filterkey] = filterval
+            # Filter our CSV to just the rows CellProfiler will process, so that we can download only what we need
+            for eachfilter in filter_dict.keys():
+                csv_in = csv_in[csv_in[eachfilter] == filter_dict[eachfilter]]
+            # Figure out the actual file names and get them
+            channel_list = [x.split('FileName_')[1] for x in csv_in.columns if 'FileName' in x]
+            count = 0
+            printandlog('Downloading files', logger)
+            for channel in channel_list:
+                for field in range(csv_in.shape[0]):
+                    full_old_file_name = os.path.join(list(csv_in['PathName_' + channel])[field], list(csv_in['FileName_' + channel])[field])
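+                    # Strip the DATA_ROOT prefix (and its leading separator) to recover the key on the bucket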
+                    prefix_on_bucket = full_old_file_name.split(DATA_ROOT)[1][1:]
+                    new_file_name = os.path.join(localIn, prefix_on_bucket)
+                    if not os.path.exists(os.path.split(new_file_name)[0]):
+                        os.makedirs(os.path.split(new_file_name)[0])
+                        printandlog('made directory ' + os.path.split(new_file_name)[0], logger)
+                    s3.meta.client.download_file(AWS_BUCKET, prefix_on_bucket, new_file_name)
+                    count += 1
+            printandlog('Downloaded ' + str(count) + ' files', logger)
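+            # Write a copy of the CSV whose paths point at the local downloads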
+            local_csv_name = os.path.join(localIn, os.path.split(csv_name)[1])
+            if not os.path.exists(local_csv_name):
+                csv_in = pandas.read_csv(os.path.join(DATA_ROOT, message['data_file']))
+                csv_in.replace(DATA_ROOT, localIn, regex=True, inplace=True)
+                csv_in.to_csv(local_csv_name, index=False)
+                printandlog('Wrote updated CSV', logger)
+            csv_name = local_csv_name
+
     # Build and run CellProfiler command
     cpDone = localOut + '/cp.is.done'
     cp2 = False
@@ -173,7 +232,8 @@ def runCellProfiler(message):
         cmdstem = 'cellprofiler -c -r '
     if message['pipeline'][-3:] != '.h5':
         cmd = cmdstem + '-p %(DATA)s/%(PL)s -i %(DATA)s/%(IN)s -o %(OUT)s -d ' + cpDone
-        cmd += ' --data-file=%(DATA)s/%(FL)s '
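+        # Use csv_name here, since it may now point at the locally rewritten CSV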
+        cmd += ' --data-file=' + csv_name + ' '
         cmd += '-g %(Metadata)s'
     else:
         cmd = cmdstem + '-p %(DATA)s/%(PL)s -i %(DATA)s/%(IN)s -o %(OUT)s -d ' + cpDone + ' -g %(Metadata)s'
@@ -189,6 +249,10 @@ def runCellProfiler(message):
     # Get the outputs and move them to S3
     if os.path.isfile(cpDone):
         time.sleep(30)
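+        # Clean up the locally downloaded inputs now that CellProfiler is done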
+        if os.path.exists(localIn):
+            import shutil
+            shutil.rmtree(localIn, ignore_errors=True)
         mvtries = 0
         while mvtries < 3:
             try: