Skip to content

Commit 6d444ee

Browse files
committed
fixed bugs related to self.store vs store_select. Found a couple of date caching bugs as well. Rewrote config.py for YAML format.
@hhuuggoo can you confirm that start=start_row, stop=end_row are unnecessary in store_select statements now?
1 parent 9ee1ac2 commit 6d444ee

File tree

4 files changed

+71
-74
lines changed

4 files changed

+71
-74
lines changed

arraymanagement/nodes/sqlcaching.py

Lines changed: 20 additions & 25 deletions
Original file line number | Diff line number | Diff line change
@@ -369,12 +369,12 @@ def select(self, query_filter, where=None):
369369
if cache_info is None:
370370
self.cache_data(query_filter)
371371
cache_info = self.cache_info(query_filter)
372+
372373
start_row, end_row = cache_info
373-
if not where:
374-
where = None
375-
376-
result = store_select(self.store, self.localpath, where=where,
377-
start=start_row, stop=end_row)
374+
375+
#removed start_row, end_row
376+
result = store_select(self.store, self.localpath, where=where)
377+
# start=start_row, stop=end_row)
378378
return result
379379

380380
def cache_query(self, query_filter):
@@ -403,11 +403,13 @@ def store_cache_spec(self, query_filter, start_row, end_row):
403403
write_pandas(self.store, 'cache_spec', data, {}, 1.1,
404404
replace=False)
405405

406+
406407
def cache_info(self, query_filter):
407408
hashval = self.gethashval(query_filter)
408409
try:
409-
result = store_select(self.store, 'cache_spec',
410-
where=[('hashval', hashval)])
410+
#rewriting where statement for 0.13 pandas style
411+
result = store_select(self.store, 'cache_spec',
412+
where='hashval=="{}"'.format(hashval))
411413
except KeyError:
412414
return None
413415
if result is None:
@@ -446,14 +448,13 @@ def init_from_file(self):
446448
setattr(self, name, column(name))
447449

448450
def select(self, query_filter, where=None, **kwargs):
449-
450451
ignore_cache = kwargs.get('IgnoreCache',None)
451452
if ignore_cache:
452453
query = self.compiled_query(query_filter,kwargs)
453454
return query
454455

455-
456-
if 'date' not in kwargs.keys():
456+
dateKeys = [k for k in kwargs.keys() if 'date' in k]
457+
if not dateKeys:
457458
#no dates in query
458459

459460
fs = FlexibleSqlCaching(self)
@@ -464,11 +465,9 @@ def select(self, query_filter, where=None, **kwargs):
464465
return result
465466

466467
else:
467-
dateKeys = [k for k in kwargs.keys() if 'date' in k]
468468
dateKeys = sorted(dateKeys)
469469
start_date, end_date = kwargs[dateKeys[0]], kwargs[dateKeys[1]]
470470

471-
472471
result = self.cache_info(query_filter,start_date, end_date)
473472

474473
if result is None:
@@ -500,11 +499,11 @@ def store_cache_spec(self, query_filter, start_row, end_row, start_date, end_dat
500499
def cache_info(self, query_filter, start_date, end_date):
501500
hashval = self.gethashval(query_filter)
502501
try:
503-
504502
# print self.store['/cache_spec']
505-
result = store_select(self.store, 'cache_spec',
506-
where=[('hashval', hashval),
507-
('start_date',start_date)])
503+
# result = store_select(self.store, 'cache_spec',
504+
# where=[('hashval', hashval),
505+
# ('start_date',start_date)])
506+
508507
start_date = pd.Timestamp(start_date)
509508
end_date = pd.Timestamp(end_date)
510509

@@ -562,7 +561,6 @@ def cache_data(self, query_params, start_date, end_date):
562561
break;
563562

564563
all_query = and_(query_params,column(col_date) >=start_date, column(col_date) <= end_date)
565-
566564
q = self.cache_query(all_query)
567565
log.debug(str(q))
568566

@@ -582,7 +580,6 @@ def cache_data(self, query_params, start_date, end_date):
582580
db_string_types=db_string_types,
583581
db_datetime_types=db_datetime_types
584582
)
585-
586583
self.min_itemsize = min_itemsize
587584
self.finalize_min_itemsize()
588585
overrides = self.col_types
@@ -592,6 +589,7 @@ def cache_data(self, query_params, start_date, end_date):
592589
starting_row = self.table.nrows
593590
except AttributeError:
594591
starting_row = 0
592+
595593
write_pandas_hdf_from_cursor(self.store, self.localpath, cur,
596594
columns, self.min_itemsize,
597595
dtype_overrides=overrides,
@@ -602,19 +600,17 @@ def cache_data(self, query_params, start_date, end_date):
602600
ending_row = self.table.nrows
603601
except AttributeError:
604602
ending_row = 0
605-
606603
self.store_cache_spec(query_params, starting_row, ending_row, start_date, end_date)
607604

608605

609606
def munge_tables(self, hashval, start_date, end_date):
610607

611608
store = self.store
612-
store.select('cache_spec', where=[('hashval', hashval)])
609+
# store.select('cache_spec', where=[('hashval', hashval)])
613610

614611
store['/cache_spec'][['start_date','end_date']].sort(['start_date'])
615-
616-
df_min = store.select('cache_spec', where=[('start_date', '<=', start_date)]).reset_index()
617-
df_max = store.select('cache_spec', where=[('end_date', '<=', end_date)]).reset_index()
612+
df_min = store_select(store, 'cache_spec', where=[('start_date', '<=', start_date)]).reset_index()
613+
df_max = store_select(store, 'cache_spec', where=[('end_date', '<=', end_date)]).reset_index()
618614

619615
df_total = df_min.append(df_max)
620616
df_total.drop_duplicates('_end_row',inplace=True)
@@ -626,8 +622,7 @@ def munge_tables(self, hashval, start_date, end_date):
626622
for s in ss_vals:
627623
start_row = s[0]
628624
end_row = s[1]
629-
630-
temp = store.select(self.localpath,
625+
temp = store_select(store, self.localpath,
631626
start=start_row, stop=end_row)
632627
temp.head()
633628

example/datalib/config.py

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -2,6 +2,8 @@
22
from arraymanagement.nodes.csvnodes import PandasCSVNode
33
from arraymanagement.nodes.hdfnodes import PandasHDFNode, PyTables
44
from arraymanagement.nodes.sql import SimpleQueryTable
5+
from arraymanagement.nodes.sqlcaching import YamlSqlDateCaching
6+
57

68
global_config = dict(
79
is_dataset = False,
@@ -14,6 +16,7 @@
1416
('*.hdf5' , PandasHDFNode),
1517
('*.h5' , PandasHDFNode),
1618
('*.sql' , SimpleQueryTable),
19+
("*.yaml", YamlSqlDateCaching),
1720
])
1821
)
1922

example/sqlviews/example_no_dates_not_entities.yaml

Lines changed: 0 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,6 @@
11
SQL:
22
# Query for EOD data for list of entities
33
eod_stock:
4-
type: 'conditional'
54
conditionals:
65

76
query: >

tests/node_test.py

Lines changed: 48 additions & 48 deletions
Original file line number | Diff line number | Diff line change
@@ -9,54 +9,54 @@
99

1010
from arraymanagement.client import ArrayClient
1111

12-
# def setup_module():
13-
# basepath = join(dirname(dirname(__file__)), 'example')
14-
# client = ArrayClient(basepath)
15-
# client.clear_disk_cache()
16-
#
17-
# def teardown_module():
18-
# basepath = join(dirname(dirname(__file__)), 'example')
19-
# client = ArrayClient(basepath)
20-
# client.clear_disk_cache()
21-
#
22-
# def test_csv_node():
23-
# basepath = join(dirname(dirname(__file__)), 'example')
24-
# client = ArrayClient(basepath)
25-
# node = client.get_node('/csvs/sample')
26-
# data = node.get()
27-
# #better check later
28-
# assert data.shape == (73,2)
29-
#
30-
# def test_hdf_node():
31-
# basepath = join(dirname(dirname(__file__)), 'example')
32-
# client = ArrayClient(basepath)
33-
# node = client.get_node('/pandashdf5/data')
34-
# assert 'sample' in node.keys()
35-
# node = node.get_node('sample')
36-
# data = node.select()
37-
# assert data.shape == (73,2)
38-
#
39-
# def test_custom_node():
40-
# basepath = join(dirname(dirname(__file__)), 'example')
41-
# client = ArrayClient(basepath)
42-
# node = client.get_node('/custom/sample2')
43-
# data1 = node.select()
44-
# node = client.get_node('/custom/sample')
45-
# data2 = node.get()
46-
# assert data2.iloc[2]['values'] == 2
47-
# assert data1.iloc[2]['values'] == 4
48-
#
49-
#
50-
# def test_csv_node():
51-
# basepath = join(dirname(dirname(__file__)), 'example')
52-
# client = ArrayClient(basepath)
53-
# node = client.get_node('/customcsvs/sample')
54-
# data1 = node.get()
55-
# node = client.get_node('/customcsvs/sample2')
56-
# data2 = node.select()
57-
# node = client.get_node('/customcsvs/sample_pipe')
58-
# data3 = node.select()
59-
# #better check later
12+
def setup_module():
13+
basepath = join(dirname(dirname(__file__)), 'example')
14+
client = ArrayClient(basepath)
15+
client.clear_disk_cache()
16+
17+
def teardown_module():
18+
basepath = join(dirname(dirname(__file__)), 'example')
19+
client = ArrayClient(basepath)
20+
client.clear_disk_cache()
21+
22+
def test_csv_node():
23+
basepath = join(dirname(dirname(__file__)), 'example')
24+
client = ArrayClient(basepath)
25+
node = client.get_node('/csvs/sample')
26+
data = node.get()
27+
#better check later
28+
assert data.shape == (73,2)
29+
30+
def test_hdf_node():
31+
basepath = join(dirname(dirname(__file__)), 'example')
32+
client = ArrayClient(basepath)
33+
node = client.get_node('/pandashdf5/data')
34+
assert 'sample' in node.keys()
35+
node = node.get_node('sample')
36+
data = node.select()
37+
assert data.shape == (73,2)
38+
39+
def test_custom_node():
40+
basepath = join(dirname(dirname(__file__)), 'example')
41+
client = ArrayClient(basepath)
42+
node = client.get_node('/custom/sample2')
43+
data1 = node.select()
44+
node = client.get_node('/custom/sample')
45+
data2 = node.get()
46+
assert data2.iloc[2]['values'] == 2
47+
assert data1.iloc[2]['values'] == 4
48+
49+
50+
def test_csv_node():
51+
basepath = join(dirname(dirname(__file__)), 'example')
52+
client = ArrayClient(basepath)
53+
node = client.get_node('/customcsvs/sample')
54+
data1 = node.get()
55+
node = client.get_node('/customcsvs/sample2')
56+
data2 = node.select()
57+
node = client.get_node('/customcsvs/sample_pipe')
58+
data3 = node.select()
59+
#better check later
6060

6161
def test_sql_yaml_cache():
6262
basepath = join(dirname(dirname(__file__)), 'example')

0 commit comments

Comments
 (0)