Merge pull request #10 from scitran/cgc/label-matcher

cgc · web-flow · commit cd49b3a778ae · 2017-04-14T18:21:45.000-07:00
Match analysis more flexibly.
diff --git a/Makefile b/Makefile
@@ -1,7 +1,7 @@
 test:
-	# hack to avoid needing to install this package
+	# hack to avoid needing to install this client as a package
 	# from http://stackoverflow.com/a/34140498
-	python -m pytest tests
+	python -m pytest tests -v
 
 lint:
 	flake8 examples scitran_client
diff --git a/README.md b/README.md
@@ -54,6 +54,11 @@ Lint your code with
 make lint
 ```
 
+Test your code with
+```bash
+make test
+```
+
 Publish a new version of the docs with
 ```bash
 make publish_docs
diff --git a/examples/flywheel_analyzer_afq.py b/examples/flywheel_analyzer_afq.py
@@ -1,8 +1,19 @@
 import scitran_client.flywheel_analyzer as fa
+from scitran_client import ScitranClient
+
+client = ScitranClient('https://flywheel-cni.scitran.stanford.edu')
+
+
+def prefix_matcher(prefix):
+    # doing this funny prefix matching to catch both "afq" and "afq 2017-01-01..."
+    return lambda val: val == prefix or val.startswith(prefix + ' ')
+
+dtiinit_matcher = prefix_matcher('dtiinit')
+afq_matcher = prefix_matcher('afq')
 
 
 def dtiinit_inputs(acquisitions, **kwargs):
-    diffusion = fa.find(acquisitions, measurement='diffusion')
+    diffusion = fa.find(acquisitions, label='DTI 2mm b1250 84dir(axial)')
 
     return dict(
         bvec=diffusion.find_file('*.bvec'),
@@ -12,15 +23,19 @@ def dtiinit_inputs(acquisitions, **kwargs):
 
 
 def afq_inputs(analyses, **kwargs):
-    dtiinit = fa.find(analyses, label='dtiinit')
+    dtiinit = fa.find(analyses, label=dtiinit_matcher)
 
     return dict(
         dtiInit_Archive=dtiinit.find_file('dtiInit_*.zip'),
     )
 
 if __name__ == '__main__':
-    with fa.installed_client():
+    with fa.installed_client(client):
         fa.run([
-            fa.define_analysis('dtiinit', dtiinit_inputs),
-            fa.define_analysis('afq', afq_inputs),
+            fa.define_analysis(
+                'dtiinit', dtiinit_inputs,
+                label_matcher=dtiinit_matcher),
+            fa.define_analysis(
+                'afq', afq_inputs,
+                label_matcher=afq_matcher),
         ], project=fa.find_project(label='ENGAGE'))
diff --git a/examples/flywheel_analyzer_engage.py b/examples/flywheel_analyzer_engage.py
@@ -1,6 +1,26 @@
 import scitran_client.flywheel_analyzer as fa
 from scitran_client import ScitranClient
 
+client = ScitranClient('https://flywheel-cni.scitran.stanford.edu')
+with fa.installed_client(client):
+    project = fa.find_project(label='ENGAGE')
+    sessions = client.request('projects/{}/sessions'.format(project['_id'])).json()
+    session_by_subject = {}
+    second_to_first_visit_id = {}
+    for s in sessions:
+        subject = s['subject']['code'][:7].upper()
+        session_by_subject.setdefault(subject, []).append(s)
+    for subject, subject_sessions in session_by_subject.iteritems():
+        # we need at least two sessions
+        if len(subject_sessions) < 2:
+            continue
+        subject_sessions.sort(key=lambda s: s['timestamp'])
+        # HACK: this is a heuristic. We assume the two earliest sessions are
+        # the first and second visits, which will certainly be wrong for
+        # folks that skipped a BV or missed the 2mo; hopefully those folks
+        # will otherwise have data that is just fine.
+        second_to_first_visit_id[subject_sessions[1]['_id']] = subject_sessions[0]['_id']
+
 
 # XXX at least make this be just the first thing without ' 2'?
 label_to_task_type = {
@@ -39,18 +59,28 @@ def define_analysis(gear_name, acquisition_label, create_inputs):
         label=analysis_label(gear_name, acquisition_label))
 
 
-def reactivity_inputs(acquisition_label, acquisitions, **kwargs):
-    functional = fa.find(acquisitions, label=acquisition_label)
+def reactivity_inputs(acquisition_label, acquisitions, session, **kwargs):
+    functional = fa.find_required_input_source(acquisitions, label=acquisition_label)
+    # using plain find() here b/c this T1w might be missing
     structural = fa.find(acquisitions, label='T1w 1mm')
+    if not structural:
+        assert session['_id'] in second_to_first_visit_id,\
+            'the only sessions that should be missing T1w are second visits. {} was missing a T1w'\
+            .format(session['_id'])
+        first_visit_session_id = second_to_first_visit_id[session['_id']]
+        first_visit_acquisitions = client.request(
+            'sessions/{}/acquisitions'.format(first_visit_session_id)).json()
+        structural = fa.find(first_visit_acquisitions, label='T1w 1mm')
+        assert structural, 'Session {} is missing a structural.'.format(session['_id'])
 
     return dict(
         functional=functional.find_file('*.nii.gz'),
         structural=structural.find_file('*.nii.gz'),
     )
 
 
-def connectivity_inputs(acquisition_label, analyses, acquisitions):
-    reactivity = fa.find(
+def connectivity_inputs(acquisition_label, analyses, **kwargs):
+    reactivity = fa.find_required_input_source(
         analyses, label=analysis_label('reactivity-preprocessing', acquisition_label))
 
     return dict(
@@ -60,12 +90,12 @@ def connectivity_inputs(acquisition_label, analyses, acquisitions):
     )
 
 
-def first_level_model_inputs(acquisition_label, analyses, acquisitions):
-    reactivity = fa.find(
+def first_level_model_inputs(acquisition_label, analyses, acquisitions, **kwargs):
+    reactivity = fa.find_required_input_source(
         analyses, label=analysis_label('reactivity-preprocessing', acquisition_label))
-    connectivity = fa.find(
+    connectivity = fa.find_required_input_source(
         analyses, label=analysis_label('connectivity-preprocessing', acquisition_label))
-    behavioral = fa.find(
+    behavioral = fa.find_required_input_source(
         acquisitions, label='Behavioral and Physiological')
 
     return dict(
@@ -81,7 +111,7 @@ def first_level_model_inputs(acquisition_label, analyses, acquisitions):
     ), dict(task_type=label_to_task_type[acquisition_label])
 
 if __name__ == '__main__':
-    with fa.installed_client(ScitranClient('https://flywheel-cni.scitran.stanford.edu')):
+    with fa.installed_client(client):
         fa.run([
             define_analysis('reactivity-preprocessing', 'go-no-go 2', reactivity_inputs),
             define_analysis('connectivity-preprocessing', 'go-no-go 2', connectivity_inputs),
@@ -94,4 +124,8 @@ def first_level_model_inputs(acquisition_label, analyses, acquisitions):
             define_analysis('reactivity-preprocessing', 'nonconscious 2', reactivity_inputs),
             define_analysis('connectivity-preprocessing', 'nonconscious 2', connectivity_inputs),
             define_analysis('first-level-models', 'nonconscious 2', first_level_model_inputs),
-        ], project=fa.find_project(label='ENGAGE'), session_limit=1)
+
+            define_analysis('reactivity-preprocessing', 'EmoReg', reactivity_inputs),
+            define_analysis('connectivity-preprocessing', 'EmoReg', connectivity_inputs),
+            # define_analysis('first-level-models', 'EmoReg', first_level_model_inputs),
+        ], project=project)
diff --git a/scitran_client/flywheel_analyzer.py b/scitran_client/flywheel_analyzer.py
@@ -5,9 +5,10 @@
 from concurrent.futures import ThreadPoolExecutor, CancelledError
 import traceback
 from fnmatch import fnmatch
-from collections import namedtuple, Counter
+from collections import namedtuple
 import math
 from contextlib import contextmanager
+import os
 
 
 def _sleep(seconds):
@@ -23,10 +24,10 @@ def _sleep(seconds):
 
 
 FlywheelAnalysisOperation = namedtuple('FlywheelAnalysisOperation', [
-    'gear_name', 'create_inputs', 'label'])
+    'gear_name', 'create_inputs', 'label', 'label_matcher'])
 
 
-def define_analysis(gear_name, create_inputs, label=None):
+def define_analysis(gear_name, create_inputs, label=None, label_matcher=None):
     '''Defines an analysis operation that can be passed to run(...).
 
     An analysis operation has a gear name, label (which defaults to
@@ -38,7 +39,10 @@ def define_analysis(gear_name, create_inputs, label=None):
     inputs (to override the default config).
     '''
     label = label or gear_name
-    return FlywheelAnalysisOperation(gear_name, create_inputs, label)
+    label_matcher = label_matcher or label
+    assert find([dict(label=label)], label=label_matcher),\
+        'Label matcher for operation {} does not detect this operation.'.format(label)
+    return FlywheelAnalysisOperation(gear_name, create_inputs, label, label_matcher)
 
 
 class FlywheelFileContainer(dict):
@@ -105,11 +109,24 @@ def find(items, _constructor_=FlywheelFileContainer, **kwargs):
     # TODO make this have better errors messages for missing files
     result = next((
         item for item in items
-        if all(item[k] == v for k, v in kwargs.iteritems())
+        if all(
+            v(item[k]) if callable(v) else item[k] == v
+            for k, v in kwargs.iteritems()
+        )
     ), None)
     return result and _constructor_(result)
 
 
+def find_required_input_source(items, **kwargs):
+    '''Finds a match to `kwargs` in `items` by using `find()`. If no match is
+    found, raises `SkipOperation` so that the current operation is skipped.
+    '''
+    result = find(items, **kwargs)
+    if not result:
+        raise SkipOperation('could not find match to {}'.format(kwargs))
+    return result
+
+
 def find_project(**kwargs):
     '''Finds a project that matches the key, value pairs in `kwargs`.
 
@@ -123,6 +140,19 @@ class ShuttingDownException(Exception):
     shutting_down = False
 
 
+class SkipOperation(Exception):
+    '''
+    SkipOperation can be raised from a `create_inputs` function to skip the execution of that
+    operation. This is a way to create operation graphs more dynamically by discarding nodes
+    at runtime.
+
+    For example, if sessions have a varying number of functional acquisitions that need to be
+    processed, you can define operations for the max number of per-session functional
+    acquisitions, and raise SkipOperation for each operation whose acquisition a session lacks.
+    '''
+    pass
+
+
 def request(*args, **kwargs):
     # HACK client is a module variable for now. In the future, we should pass client around.
     assert 'client' in state, 'client must be installed in state before using request. See `installed_client`.'
@@ -187,8 +217,8 @@ def _analyze_session(operations, gears_by_name, session):
     acquisitions = None
     session_id = session['_id']
     analyses = _get_analyses(session_id)
-    for gear_name, create_inputs, label in operations:
-        analysis = find(analyses, label=label)
+    for gear_name, create_inputs, label, label_matcher in operations:
+        analysis = find(analyses, label=label_matcher)
 
         # skip this analysis if we've already done it
         if analysis and analysis['job']['state'] == 'complete':
@@ -201,7 +231,12 @@ def _analyze_session(operations, gears_by_name, session):
             # have completed analysis
             if not acquisitions:
                 acquisitions = request('sessions/{}/acquisitions'.format(session_id))
-            job_inputs = create_inputs(analyses=analyses, acquisitions=acquisitions)
+            try:
+                job_inputs = create_inputs(analyses=analyses, acquisitions=acquisitions, session=session)
+            except SkipOperation:
+                # we skip to the next operation
+                continue
+
             job_config = _defaults_for_gear(gears_by_name[gear_name])
 
             # When create_inputs returns a tuple, we unpack it into job_inputs and job_config.
@@ -211,7 +246,7 @@ def _analyze_session(operations, gears_by_name, session):
                 job_inputs, job_config = job_inputs[0], dict(job_config, **job_inputs[1])
             _submit_analysis(session_id, gear_name, job_inputs, job_config, label)
 
-        analyses = _wait_for_analysis(session_id, label)
+        analyses = _wait_for_analysis(session_id, label_matcher)
     print(session_id, 'all analysis complete')
 
 
@@ -225,6 +260,7 @@ def done(f):
         except (ShuttingDownException, CancelledError):
             pass
         except Exception:
+            print('error with {}'.format(f.name))
             traceback.print_exc()
 
     for future in futures:
@@ -270,13 +306,23 @@ def run(operations, project=None, max_workers=10, session_limit=None):
         will use and how many CPUs you can use from your Flywheel Engine instance.
     session_limit - Used to test pipelines out by limiting the number of sessions
         the pipeline code will run on.
+
+    Enabling status mode - When the environment variable
+    FLYWHEEL_ANALYZER_STATUS is set to `true` (case-insensitive), this function
+    only prints the status of this pipeline. It will not run anything.
     """
     gears = [g['gear'] for g in request('gears', params=dict(fields='all'))]
     gears_by_name = {
         gear['name']: gear
         for gear in gears
     }
 
+    # HACK this is seriously a total hack, but is a nice way to see the status
+    # of a pipeline without editing code.
+    if os.environ.get('FLYWHEEL_ANALYZER_STATUS', '').lower() == 'true':
+        status(operations, project)
+        return
+
     for operation in operations:
         assert operation.gear_name in gears_by_name,\
             'operation(name={}, label={}) has an invalid name.'.format(
@@ -295,28 +341,37 @@ def run(operations, project=None, max_workers=10, session_limit=None):
         sessions = sessions[:session_limit]
 
     with ThreadPoolExecutor(max_workers=max_workers) as executor:
-        futures = [
-            executor.submit(_analyze_session, operations, gears_by_name, session)
-            for session in sessions
-        ]
+        futures = []
+        for session in sessions:
+            f = executor.submit(_analyze_session, operations, gears_by_name, session)
+            f.name = 'session {}'.format(session['_id'])
+            futures.append(f)
         _wait_for_futures(futures)
 
 
-def _session_status(expected_ops, session):
+def _session_status(operations, session):
     analyses = _get_analyses(session['_id'])
 
-    started_ops = {
-        a['label'] for a in analyses}
+    started_ops = set()
+    completed_ops = set()
+    expected_ops = set()
+
+    for op in operations:
+        a = find(analyses, label=op.label_matcher)
+        if a:
+            started_ops.add(op.label)
+            if a['job']['state'] == 'complete':
+                completed_ops.add(op.label)
+        expected_ops.add(op.label)
+
     if not started_ops:
         return 'not started'
 
-    completed_ops = {
-        a['label'] for a in analyses
-        if a['job']['state'] == 'complete'}
     if completed_ops == expected_ops:
         return 'complete'
     else:
-        return 'in progress'
+        return 'in progress ({} of {} done)'.format(
+            len(completed_ops), len(expected_ops))
 
 
 def status(operations, project=None, detail=False):
@@ -325,13 +380,12 @@ def status(operations, project=None, detail=False):
     detail - When true, some session IDs for each status are logged.
     '''
     sessions = request('projects/{}/sessions'.format(project['_id']))
-    expected_ops = {op.label for op in operations}
-    statuses = [(s, _session_status(expected_ops, s)) for s in sessions]
-    if detail:
-        result = {}
-        for sess, stat in statuses:
-            result.setdefault(stat, []).append(sess['_id'])
-        for stat, session_ids in result.iteritems():
-            print(stat, len(session_ids), 'some IDs:', session_ids[:4])
-    else:
-        print(Counter(stat for _, stat in statuses))
+    statuses = [(s, _session_status(operations, s)) for s in sessions]
+    result = {}
+    for sess, stat in statuses:
+        result.setdefault(stat, []).append(sess['_id'])
+    for stat, session_ids in sorted(result.iteritems()):
+        msg = []
+        if detail:
+            msg = ['some IDs:', session_ids[:4]]
+        print(len(session_ids), stat, *msg)
diff --git a/scitran_client/st_client.py b/scitran_client/st_client.py
@@ -278,12 +278,12 @@ def download_file(
         desc = tqdm_kwargs.pop('desc', file_name)
         leave = tqdm_kwargs.pop('leave', False)
         with open(abs_file_path, 'wb') as fd:
-            content = response.iter_content()
+            content = response.iter_content(4096)
             if not tqdm_disable:
                 content = tqdm(
-                    response.iter_content(),
+                    content,
                     desc=desc, leave=leave,
-                    unit_scale=True, unit='B',
+                    unit=' 4KB',
                     **tqdm_kwargs
                 )
             for chunk in content:
diff --git a/tests/test_flywheel_analyzer.py b/tests/test_flywheel_analyzer.py