
Commit 07170ae

Merge pull request #623 from mwaskom/enh/better_datagrabber

Improved alternate DataGrabber interface

2 parents: 935b865 + 89cd205

5 files changed: +212 −55 lines


CHANGES

Lines changed: 1 addition & 0 deletions
@@ -1,6 +1,7 @@
 Next release
 ============
 
+* ENH: SelectFiles: a streamlined version of DataGrabber
 * ENH: New interfaces: spm.ResliceToReference
 
 * FIX: Deals properly with 3d files in SPM Realign
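
For context on the changelog entry, here is a hedged sketch contrasting the pre-existing DataGrabber setup with the new SelectFiles interface for the same task. The paths and subject field are hypothetical; the DataGrabber settings follow its existing usage in nipype.interfaces.io:

    from nipype import DataGrabber, Node, SelectFiles

    # DataGrabber: %-style template plus a separate template_args mapping
    dg = Node(DataGrabber(infields=['subject_id'], outfields=['T1']),
              name='datagrabber')
    dg.inputs.base_directory = '/data'
    dg.inputs.template = '%s/struct/T1.nii'
    dg.inputs.template_args = {'T1': [['subject_id']]}
    dg.inputs.sort_filelist = True

    # SelectFiles: a single dict of {}-format templates; the input field
    # (here subject_id) is inferred from the braces automatically
    sf = Node(SelectFiles({'T1': '{subject_id}/struct/T1.nii'},
                          base_directory='/data'),
              name='selectfiles')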

nipype/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -17,7 +17,7 @@
 
 from pipeline import Node, MapNode, Workflow
 from interfaces import (fsl, spm, freesurfer, afni, ants, slicer, dipy, nipy,
-                        mrtrix, camino, DataGrabber, DataSink,
+                        mrtrix, camino, DataGrabber, DataSink, SelectFiles,
                         IdentityInterface, Rename, Function, Select, Merge)

nipype/interfaces/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -7,6 +7,6 @@
 """
 __docformat__ = 'restructuredtext'
 
-from io import DataGrabber, DataSink
+from io import DataGrabber, DataSink, SelectFiles
 from utility import IdentityInterface, Rename, Function, Select, Merge
 import fsl, spm, freesurfer, afni, ants, slicer, dipy, nipy, mrtrix, camino
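
With both package __init__ changes applied, SelectFiles is exposed alongside DataGrabber and DataSink. A minimal sketch of the import paths this enables (assuming an installation that includes this commit):

    from nipype import SelectFiles
    from nipype.interfaces import SelectFiles as SelectFilesFromInterfaces
    from nipype.interfaces.io import SelectFiles as SelectFilesFromIO

    # All three names refer to the same class object
    assert SelectFiles is SelectFilesFromInterfaces is SelectFilesFromIO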

nipype/interfaces/io.py

Lines changed: 153 additions & 40 deletions
@@ -18,7 +18,9 @@
 
 """
 import glob
+import string
 import os
+import os.path as op
 import shutil
 import re
 import tempfile
@@ -528,9 +530,9 @@ def _list_outputs(self):
             filledtemplate = template
             if argtuple:
                 try:
-                    filledtemplate = template%tuple(argtuple)
+                    filledtemplate = template % tuple(argtuple)
                 except TypeError as e:
-                    raise TypeError(e.message + ": Template %s failed to convert with args %s"%(template, str(tuple(argtuple))))
+                    raise TypeError(e.message + ": Template %s failed to convert with args %s" % (template, str(tuple(argtuple))))
             outfiles = glob.glob(filledtemplate)
             if len(outfiles) == 0:
                 msg = 'Output key: %s Template: %s returned no files' % (key, filledtemplate)
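
The two modified lines above only add spacing around the % operator, but they mark the DataGrabber substitution step that SelectFiles replaces with {}-formatting. An illustrative sketch of that %-style substitution and the failure mode the except clause handles (template and argument values here are hypothetical):

    template = "%s/func/f%d.nii"
    argtuple = ["subj1", 3]
    try:
        filledtemplate = template % tuple(argtuple)   # -> 'subj1/func/f3.nii'
    except TypeError as e:
        # Raised when the arguments do not line up with the format specifiers,
        # e.g. too few values or a string where %d expects an integer.
        raise TypeError(str(e) + ": Template %s failed to convert with args %s"
                        % (template, str(tuple(argtuple))))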
@@ -551,18 +553,146 @@ def _list_outputs(self):
             outputs[key] = outputs[key][0]
         return outputs
 
-class DataFinderInputSpec(DynamicTraitedSpec, BaseInterfaceInputSpec):
+
+class SelectFilesInputSpec(DynamicTraitedSpec, BaseInterfaceInputSpec):
+
+    base_directory = Directory(exists=True,
+        desc="Root path common to templates.")
+    sort_filelist = traits.Bool(True, usedefault=True,
+        desc="When matching multiple files, return them in sorted order.")
+    raise_on_empty = traits.Bool(True, usedefault=True,
+        desc="Raise an exception if a template pattern matches no files.")
+    force_lists = traits.Bool(False, usedefault=True,
+        desc="Return all values as lists even when matching a single file.")
+
+
+class SelectFiles(IOBase):
+    """Flexibly collect data from disk to feed into workflows.
+
+    This interface uses the {}-based string formatting syntax to plug
+    values (possibly known only at workflow execution time) into string
+    templates and collect files from persistent storage. These templates
+    can also be combined with glob wildcards. The field names in the
+    formatting template (i.e. the terms in braces) will become input
+    fields on the interface, and the keys in the templates dictionary
+    will form the output fields.
+
+    Examples
+    --------
+
+    >>> from nipype import SelectFiles, Node
+    >>> templates={"T1": "{subject_id}/struct/T1.nii",
+    ...            "epi": "{subject_id}/func/f[0, 1].nii"}
+    >>> dg = Node(SelectFiles(templates), "selectfiles")
+    >>> dg.inputs.subject_id = "subj1"
+    >>> dg.outputs.get()
+    {'T1': <undefined>, 'epi': <undefined>}
+
+    The same thing with dynamic grabbing of specific files:
+
+    >>> templates["epi"] = "{subject_id}/func/f{run!s}.nii"
+    >>> dg = Node(SelectFiles(templates), "selectfiles")
+    >>> dg.inputs.subject_id = "subj1"
+    >>> dg.inputs.run = [2, 4]
+
+    """
+    input_spec = SelectFilesInputSpec
+    output_spec = DynamicTraitedSpec
+    _always_run = True
+
+    def __init__(self, templates, **kwargs):
+        """Create an instance with specific input fields.
+
+        Parameters
+        ----------
+        templates : dictionary
+            Mapping from string keys to string template values.
+            The keys become output fields on the interface.
+            The templates should use {}-formatting syntax, where
+            the names in curly braces become input fields on the interface.
+            Format strings can also use glob wildcards to match multiple
+            files. At runtime, the values of the interface inputs will be
+            plugged into these templates, and the resulting strings will be
+            used to select files.
+
+        """
+        super(SelectFiles, self).__init__(**kwargs)
+
+        # Infer the infields and outfields from the template
+        infields = []
+        for name, template in templates.iteritems():
+            for _, field_name, _, _ in string.Formatter().parse(template):
+                if field_name is not None and field_name not in infields:
+                    infields.append(field_name)
+
+        self._infields = infields
+        self._outfields = list(templates)
+        self._templates = templates
+
+        # Add the dynamic input fields
+        undefined_traits = {}
+        for field in infields:
+            self.inputs.add_trait(field, traits.Any)
+            undefined_traits[field] = Undefined
+        self.inputs.trait_set(trait_change_notify=False, **undefined_traits)
+
+    def _add_output_traits(self, base):
+        """Add the dynamic output fields"""
+        return add_traits(base, self._templates.keys())
+
+    def _list_outputs(self):
+        """Find the files and expose them as interface outputs."""
+        outputs = {}
+        info = dict([(k, v) for k, v in self.inputs.__dict__.items()
+                     if k in self._infields])
+
+        for field, template in self._templates.iteritems():
+
+            # Build the full template path
+            if isdefined(self.inputs.base_directory):
+                template = op.abspath(op.join(
+                    self.inputs.base_directory, template))
+            else:
+                template = op.abspath(template)
+
+            # Fill in the template and glob for files
+            filled_template = template.format(**info)
+            filelist = glob.glob(filled_template)
+
+            # Handle the case where nothing matched
+            if not filelist:
+                msg = "No files were found matching %s template: %s" % (
+                    field, template)
+                if self.inputs.raise_on_empty:
+                    raise IOError(msg)
+                else:
+                    warn(msg)
+
+            # Possibly sort the list
+            if self.inputs.sort_filelist:
+                filelist.sort()
+
+            # Handle whether this must be a list or not
+            if not self.inputs.force_lists:
+                filelist = list_to_filename(filelist)
+
+            outputs[field] = filelist
+
+        return outputs
+
+
+class DataFinderInputSpec(DynamicTraitedSpec, BaseInterfaceInputSpec):
     root_paths = traits.Either(traits.List(),
                                traits.Str(),
                                mandatory=True,)
-    match_regex = traits.Str('(.+)',
+    match_regex = traits.Str('(.+)',
                              usedefault=True,
                              desc=("Regular expression for matching "
                                    "paths."))
     ignore_regexes = traits.List(desc=("List of regular expressions, "
                                        "if any match the path it will be "
                                        "ignored.")
-                                       )
+                                 )
     max_depth = traits.Int(desc="The maximum depth to search beneath "
                            "the root_paths")
     min_depth = traits.Int(desc="The minimum depth to search beneath "
@@ -573,23 +703,19 @@ class DataFinderInputSpec(DynamicTraitedSpec, BaseInterfaceInputSpec):
 
 
 class DataFinder(IOBase):
-    """Search for paths that match a given regular expression. Allows a less
+    """Search for paths that match a given regular expression. Allows a less
     proscriptive approach to gathering input files compared to DataGrabber.
-    Will recursively search any subdirectories by default. This can be limited
-    with the min/max depth options.
-
-    Matched paths are available in the output 'out_paths'. Any named groups of
-    captured text from the regular expression are also available as ouputs of
+    Will recursively search any subdirectories by default. This can be limited
+    with the min/max depth options.
+    Matched paths are available in the output 'out_paths'. Any named groups of
+    captured text from the regular expression are also available as ouputs of
     the same name.
-
     Examples
     --------
 
     >>> from nipype.interfaces.io import DataFinder
-
-    Look for Nifti files in directories with "ep2d_fid" or "qT1" in the name,
+    Look for Nifti files in directories with "ep2d_fid" or "qT1" in the name,
     starting in the current directory.
-
     >>> df = DataFinder()
    >>> df.inputs.root_paths = '.'
    >>> df.inputs.match_regex = '.+/(?P<series_dir>.+(qT1|ep2d_fid_T1).+)/(?P<basename>.+)\.nii.gz'
@@ -599,45 +725,39 @@ class DataFinder(IOBase):
      './018-ep2d_fid_T1_Gd2/acquisition.nii.gz',
      './016-ep2d_fid_T1_Gd1/acquisition.nii.gz',
      './013-ep2d_fid_T1_pre/acquisition.nii.gz']
-
     >>> print result.outputs.series_dir # doctest: +SKIP
     ['027-ep2d_fid_T1_Gd4',
      '018-ep2d_fid_T1_Gd2',
      '016-ep2d_fid_T1_Gd1',
      '013-ep2d_fid_T1_pre']
-
     >>> print result.outputs.basename # doctest: +SKIP
     ['acquisition',
      'acquisition',
      'acquisition',
      'acquisition']
 
     """
-
     input_spec = DataFinderInputSpec
     output_spec = DynamicTraitedSpec
     _always_run = True
-
+
     def _match_path(self, target_path):
         #Check if we should ignore the path
         for ignore_re in self.ignore_regexes:
             if ignore_re.search(target_path):
                 return
-
         #Check if we can match the path
         match = self.match_regex.search(target_path)
         if not match is None:
             match_dict = match.groupdict()
-
             if self.result is None:
-                self.result = {'out_paths' : []}
+                self.result = {'out_paths': []}
                 for key in match_dict.keys():
                     self.result[key] = []
-
             self.result['out_paths'].append(target_path)
             for key, val in match_dict.iteritems():
                 self.result[key].append(val)
-
+
     def _run_interface(self, runtime):
         #Prepare some of the inputs
         if isinstance(self.inputs.root_paths, str):
@@ -655,56 +775,49 @@ def _run_interface(self, runtime):
             self.ignore_regexes = []
         else:
             self.ignore_regexes = \
-                [re.compile(regex)
+                [re.compile(regex)
                  for regex in self.inputs.ignore_regexes]
-
         self.result = None
         for root_path in self.inputs.root_paths:
             #Handle tilda/env variables and remove extra seperators
             root_path = os.path.normpath(os.path.expandvars(os.path.expanduser(root_path)))
-
             #Check if the root_path is a file
             if os.path.isfile(root_path):
                 if min_depth == 0:
                     self._match_path(root_path)
                 continue
-
-            #Walk through directory structure checking paths
+            #Walk through directory structure checking paths
             for curr_dir, sub_dirs, files in os.walk(root_path):
                 #Determine the current depth from the root_path
-                curr_depth = (curr_dir.count(os.sep) -
+                curr_depth = (curr_dir.count(os.sep) -
                               root_path.count(os.sep))
-
-                #If the max path depth has been reached, clear sub_dirs
+                #If the max path depth has been reached, clear sub_dirs
                 #and files
-                if (not max_depth is None and
-                    curr_depth >= max_depth):
+                if max_depth is not None and curr_depth >= max_depth:
                     sub_dirs[:] = []
                     files = []
-
                 #Test the path for the curr_dir and all files
                 if curr_depth >= min_depth:
                     self._match_path(curr_dir)
                 if curr_depth >= (min_depth - 1):
                     for infile in files:
                         full_path = os.path.join(curr_dir, infile)
                         self._match_path(full_path)
-
-        if (self.inputs.unpack_single and
+        if (self.inputs.unpack_single and
             len(self.result['out_paths']) == 1
-            ):
+            ):
             for key, vals in self.result.iteritems():
                 self.result[key] = vals[0]
-
         if not self.result:
             raise RuntimeError("Regular expression did not match any files!")
         return runtime
-
+
     def _list_outputs(self):
         outputs = self._outputs().get()
         outputs.update(self.result)
         return outputs
 
+
 class FSSourceInputSpec(BaseInterfaceInputSpec):
     subjects_dir = Directory(mandatory=True,
                              desc='Freesurfer subjects directory.')
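
Taken together, _match_path and _run_interface walk each root path, prune the walk at max_depth, and collect regex named groups as list-valued outputs. A condensed standalone sketch of that logic (the root path, depth limits, and regex below are illustrative, borrowed from the docstring example):

    import os
    import re

    root_path = '.'
    min_depth, max_depth = 0, 2
    match_regex = re.compile(r'.+/(?P<series_dir>.+(qT1|ep2d_fid_T1).+)/'
                             r'(?P<basename>.+)\.nii.gz')

    result = {'out_paths': []}
    for curr_dir, sub_dirs, files in os.walk(root_path):
        curr_depth = curr_dir.count(os.sep) - root_path.count(os.sep)
        if max_depth is not None and curr_depth >= max_depth:
            sub_dirs[:] = []          # stop os.walk from descending further
            files = []
        candidates = [curr_dir] if curr_depth >= min_depth else []
        if curr_depth >= (min_depth - 1):
            candidates += [os.path.join(curr_dir, f) for f in files]
        for path in candidates:
            match = match_regex.search(path)
            if match is not None:
                result['out_paths'].append(path)
                for key, val in match.groupdict().items():
                    result.setdefault(key, []).append(val)
    print(result['out_paths'])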
