Skip to content

Commit 3b690d0

Browse files
committed
changes in pandas/io/pytables.py
1. added __str__ (to do __repr__) 2. row removal in tables is much faster if rows are consecutive 3. added Term class, refactored Selection (this is backwards compatible). Term is a concise way of specifying conditions for queries, e.g. Term(dict(field = 'index', op = '>', value = '20121114')), Term('index', '20121114'), Term('index', '>', '20121114'), Term('index', ['20121114','20121114']), Term('index', datetime(2012,11,14)), Term('index>20121114'). Updated tests for same. This should close GH pandas-dev#1996
1 parent 25cc4e1 commit 3b690d0

File tree

2 files changed

+245
-105
lines changed

2 files changed

+245
-105
lines changed

pandas/io/pytables.py

Lines changed: 194 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77

88
from datetime import datetime, date
99
import time
10+
import re
1011

1112
import numpy as np
1213
from pandas import (
@@ -916,27 +917,40 @@ def _read_panel_table(self, group, where=None):
916917
lp = DataFrame(new_values, index=new_index, columns=lp.columns)
917918
wp = lp.to_panel()
918919

919-
if sel.column_filter:
920-
new_minor = sorted(set(wp.minor_axis) & sel.column_filter)
920+
if sel.filter:
921+
new_minor = sorted(set(wp.minor_axis) & sel.filter)
921922
wp = wp.reindex(minor=new_minor)
922923
return wp
923924

924925

925-
def _delete_from_table(self, group, where):
    """ delete rows from a group where condition is True

    Parameters
    ----------
    group : the node whose ``table`` attribute holds the table to delete from
    where : a Term, or list of Terms (or convertable to), selecting the rows

    Returns
    -------
    the number of rows removed
    """
    table = getattr(group, 'table')

    # create the selection
    s = Selection(table, where, table._v_attrs.index_kind)
    s.select_coords()

    # sort the coordinates so the back-to-front removal below never shifts
    # a row that is still waiting to be deleted
    coords = sorted(set(int(c) for c in s.values))
    ln = len(coords)
    if not ln:
        return 0

    # collapse the coordinates into contiguous [start, stop) runs so that
    # each run is removed with a single call
    runs = []
    run_start = previous = coords[0]
    for c in coords[1:]:
        if c != previous + 1:
            runs.append((run_start, previous + 1))
            run_start = c
        previous = c
    runs.append((run_start, previous + 1))

    # remove from the end of the table backwards; always pass an explicit
    # stop so the call does not depend on what an omitted stop means
    for start, stop in reversed(runs):
        table.removeRows(start=start, stop=stop)

    self.handle.flush()

    # return the number of rows removed
    return ln
940954

941955
def _convert_index(index):
942956
if isinstance(index, DatetimeIndex):
@@ -1088,87 +1102,203 @@ def _alias_to_class(alias):
10881102
return _reverse_index_map.get(alias, Index)
10891103

10901104

1105+
class Term(object):
    """ create a term object that holds a field, op, and value

    Parameters
    ----------
    field : dict, string term expression, or the field to operate (must be a valid index/column type of DataFrame/Panel)
            a list/tuple of (field, op, value) or an existing Term is also accepted
    op    : a valid op (defaults to '=') (optional)
            >, >=, <, <=, =, != (not equal) are allowed
    value : a value or list of values (required)
    index_kind : the kind of the index; 'datetime64' makes index values
            convert through lib.Timestamp (optional)

    Returns
    -------
    a Term object

    Raises
    ------
    Exception if the field cannot be understood, if field/op/value are
    incomplete, if the field is not a valid index/column name, if an
    inequality is given multiple values, or if a != term has too many values

    Examples
    --------
    Term(dict(field = 'index', op = '>', value = '20121114'))
    Term('index', '20121114')
    Term('index', '>', '20121114')
    Term('index', ['20121114','20121114'])
    Term('index', datetime(2012,11,14))
    Term('index>20121114')

    """

    _ops = ['<', '<=', '>', '>=', '=', '!=']

    # regex alternation takes the first alternative that matches, so the
    # two-character ops must come before their one-character prefixes
    # (otherwise 'index>=x' parses as op '>' with value '=x')
    _search = re.compile(r"^(?P<field>\w+)(?P<op>%s)(?P<value>.+)$"
                         % '|'.join(sorted(_ops, key=len, reverse=True)))
    _index = ['index', 'major_axis']
    _column = ['column', 'minor_axis', 'items']

    # above this many values an equality term is applied as a post-read
    # filter instead of a query expression
    # NOTE(review): presumably an operand limit of the expression engine - confirm
    _max_selectors = 61

    def __init__(self, field, op=None, value=None, index_kind=None):
        self.field = None
        self.op = None
        self.value = None
        self.typ = None
        self.index_kind = index_kind
        self.filter = None
        self.condition = None

        # unpack lists/tuples in field
        if isinstance(field, (tuple, list)):
            f = field
            field = f[0]
            if len(f) > 1:
                op = f[1]
            if len(f) > 2:
                value = f[2]

        # backwards compatible
        if isinstance(field, dict):
            self.field = field.get('field')
            self.op = field.get('op') or '='
            self.value = field.get('value')

        # passed a term
        elif isinstance(field, Term):
            self.field = field.field
            self.op = field.op
            self.value = field.value

        # a string expression (or just the field)
        elif isinstance(field, basestring):

            # is a full term expression passed?
            s = self._search.match(field)
            if s is not None:
                self.field = s.group('field')
                self.op = s.group('op')
                self.value = s.group('value')

            else:
                self.field = field

                # is an op passed?
                if isinstance(op, basestring) and op in self._ops:
                    self.op = op
                    self.value = value
                else:
                    # two-argument form: the second argument is the value
                    self.op = '='
                    self.value = op

        else:
            raise Exception("Term does not understand the supplied field [%s]" % (field,))

        # we have valid fields
        if self.field is None or self.op is None or self.value is None:
            raise Exception("Could not create this term [%s]" % str(self))

        # valid field name
        if self.field in self._index:
            self.typ = 'index'
        elif self.field in self._column:
            self.typ = 'column'
        else:
            raise Exception("field is not a valid index/column for this term [%s]" % str(self))

        # we have valid conditions
        multi = self._is_multi_valued(self.value)
        if self.op in ['>', '>=', '<', '<=']:
            if multi and len(self.value) > 1:
                raise Exception("an inequality condition cannot have multiple values [%s]" % str(self))

        # always hold the value(s) as a list
        if not multi:
            self.value = [self.value]

        self.eval()

    @staticmethod
    def _is_multi_valued(value):
        """ True for an iterable of values; a string is always a single value """
        return hasattr(value, '__iter__') and not isinstance(value, basestring)

    def __str__(self):
        return "typ->%s,field->%s,op->%s,value->%s" % (self.typ, self.field, self.op, self.value)

    __repr__ = __str__

    def eval(self):
        """ set the numexpr expression (or the post-read filter) for this term """

        # convert values
        values = [self.convert_value(v) for v in self.value]

        # equality conditions
        if self.op in ['=', '!=']:

            # too many values to create the expression?
            if len(values) > self._max_selectors:

                # the filter is an inclusion set, so it cannot express !=
                if self.op == '!=':
                    raise Exception("a != condition cannot have more than %d values [%s]"
                                    % (self._max_selectors, str(self)))

                # use a filter after reading
                self.filter = set(v[1] for v in values)

            elif self.op == '=':
                # match any of the values
                self.condition = "(%s)" % ' | '.join(
                    ["(%s == %s)" % (self.field, v[0]) for v in values])

            else:
                # match none of the values
                self.condition = "(%s)" % ' & '.join(
                    ["(%s != %s)" % (self.field, v[0]) for v in values])

        else:

            self.condition = '(%s %s %s)' % (self.field, self.op, values[0][0])

    def convert_value(self, v):
        """ return [text to place in the expression, value to place in the filter] for v """

        if self.typ == 'index':
            if self.index_kind == 'datetime64':
                return [lib.Timestamp(v).value, None]
            elif isinstance(v, datetime):
                return [time.mktime(v.timetuple()), None]
            elif not isinstance(v, basestring):
                return [str(v), None]

        # string quoting; formatting (rather than concatenation) also
        # accepts non-string values
        return ["'%s'" % (v,), v]
1249+
10911250
class Selection(object):
    """
    Carries out a selection operation on a tables.Table object.

    Parameters
    ----------
    table : tables.Table
    where : list of Terms (or convertible to)
    index_kind : the kind of the index, handed to every Term that is built

    """
    def __init__(self, table, where=None, index_kind=None):
        self.table = table
        self.where = where
        self.index_kind = index_kind
        self.values = None
        self.condition = None
        self.filter = None
        self.terms = self.generate(where)

        # nothing to combine when no terms were supplied
        if not self.terms:
            return

        # all term expressions must hold at once
        expressions = [term.condition for term in self.terms
                       if term.condition is not None]
        if expressions:
            self.condition = "(%s)" % ' & '.join(expressions)

        # gather the post-read filter values of every term
        combined = set()
        for term in self.terms:
            if term.filter is not None:
                combined |= term.filter
        self.filter = combined

    def generate(self, where):
        """ build and return the list of Terms for where (None when where is None) """
        if where is None:
            return None

        # a single term specification is treated as a one-element list
        candidates = where if isinstance(where, (list, tuple)) else [where]
        return [Term(item, index_kind=self.index_kind) for item in candidates]

    def select(self):
        """
        read the rows matching the condition (or every row) into self.values
        """
        if self.condition is None:
            self.values = self.table.read()
        else:
            self.values = self.table.readWhere(self.condition)

    def select_coords(self):
        """
        read the coordinates of the rows matching the condition into self.values
        """
        self.values = self.table.getWhereList(self.condition)
11731303

11741304
def _get_index_factory(klass):

0 commit comments

Comments
 (0)