@@ -2313,111 +2313,34 @@ def set_kind(self):
23132313 if self .typ is None :
23142314 self .typ = getattr (self .description , self .cname , None )
23152315
2316- def set_atom (
2317- self ,
2318- block ,
2319- existing_col ,
2320- min_itemsize ,
2321- nan_rep ,
2322- info ,
2323- encoding = None ,
2324- errors = "strict" ,
2325- ):
2316+ def set_atom (self , block , itemsize : int , data_converted , use_str : bool ):
23262317 """ create and setup my atom from the block b """
23272318
23282319 # short-cut certain block types
23292320 if block .is_categorical :
23302321 self .set_atom_categorical (block )
2331- self .update_info (info )
2332- return
23332322 elif block .is_datetimetz :
23342323 self .set_atom_datetime64tz (block )
2335- self .update_info (info )
2336- return
23372324 elif block .is_datetime :
2338- return self .set_atom_datetime64 (block )
2325+ self .set_atom_datetime64 (block )
23392326 elif block .is_timedelta :
2340- return self .set_atom_timedelta64 (block )
2327+ self .set_atom_timedelta64 (block )
23412328 elif block .is_complex :
2342- return self .set_atom_complex (block )
2343-
2344- dtype = block .dtype .name
2345- inferred_type = lib .infer_dtype (block .values , skipna = False )
2329+ self .set_atom_complex (block )
23462330
2347- if inferred_type == "date" :
2348- raise TypeError ("[date] is not implemented as a table column" )
2349- elif inferred_type == "datetime" :
2350- # after GH#8260
2351- # this only would be hit for a multi-timezone dtype
2352- # which is an error
2353-
2354- raise TypeError (
2355- "too many timezones in this block, create separate data columns"
2356- )
2357- elif inferred_type == "unicode" :
2358- raise TypeError ("[unicode] is not implemented as a table column" )
2359-
2360- # this is basically a catchall; if say a datetime64 has nans then will
2361- # end up here ###
2362- elif inferred_type == "string" or dtype == "object" :
2363- self .set_atom_string (
2364- block , existing_col , min_itemsize , nan_rep , encoding , errors ,
2365- )
2366-
2367- # set as a data block
2331+ elif use_str :
2332+ self .set_atom_string (itemsize , data_converted )
23682333 else :
2334+ # set as a data block
23692335 self .set_atom_data (block )
23702336
2371- def get_atom_string (self , block , itemsize ):
2372- return _tables ().StringCol (itemsize = itemsize , shape = block .shape [0 ])
2373-
2374- def set_atom_string (
2375- self , block , existing_col , min_itemsize , nan_rep , encoding , errors
2376- ):
2377- # fill nan items with myself, don't disturb the blocks by
2378- # trying to downcast
2379- block = block .fillna (nan_rep , downcast = False )
2380- if isinstance (block , list ):
2381- block = block [0 ]
2382- data = block .values
2383-
2384- # see if we have a valid string type
2385- inferred_type = lib .infer_dtype (data .ravel (), skipna = False )
2386- if inferred_type != "string" :
2387-
2388- # we cannot serialize this data, so report an exception on a column
2389- # by column basis
2390- for i in range (len (block .shape [0 ])):
2391-
2392- col = block .iget (i )
2393- inferred_type = lib .infer_dtype (col .ravel (), skipna = False )
2394- if inferred_type != "string" :
2395- iloc = block .mgr_locs .indexer [i ]
2396- raise TypeError (
2397- f"Cannot serialize the column [{ iloc } ] because\n "
2398- f"its data contents are [{ inferred_type } ] object dtype"
2399- )
2400-
2401- # itemsize is the maximum length of a string (along any dimension)
2402- data_converted = _convert_string_array (data , encoding , errors )
2403- itemsize = data_converted .itemsize
2404-
2405- # specified min_itemsize?
2406- if isinstance (min_itemsize , dict ):
2407- min_itemsize = int (
2408- min_itemsize .get (self .name ) or min_itemsize .get ("values" ) or 0
2409- )
2410- itemsize = max (min_itemsize or 0 , itemsize )
2411-
2412- # check for column in the values conflicts
2413- if existing_col is not None :
2414- eci = existing_col .validate_col (itemsize )
2415- if eci > itemsize :
2416- itemsize = eci
2337+ def get_atom_string (self , shape , itemsize ):
2338+ return _tables ().StringCol (itemsize = itemsize , shape = shape [0 ])
24172339
2340+ def set_atom_string (self , itemsize : int , data_converted : np .ndarray ):
24182341 self .itemsize = itemsize
24192342 self .kind = "string"
2420- self .typ = self .get_atom_string (block , itemsize )
2343+ self .typ = self .get_atom_string (data_converted . shape , itemsize )
24212344 self .set_data (data_converted .astype (f"|S{ itemsize } " , copy = False ))
24222345
24232346 def get_atom_coltype (self , kind = None ):
@@ -2621,7 +2544,7 @@ def validate_names(self):
26212544 # TODO: should the message here be more specifically non-str?
26222545 raise ValueError ("cannot have non-object label DataIndexableCol" )
26232546
2624- def get_atom_string (self , block , itemsize ):
2547+ def get_atom_string (self , shape , itemsize ):
26252548 return _tables ().StringCol (itemsize = itemsize )
26262549
26272550 def get_atom_data (self , block , kind = None ):
@@ -3972,17 +3895,26 @@ def get_blk_items(mgr, blocks):
39723895 else :
39733896 existing_col = None
39743897
3975- col = klass . create_for_block ( i = i , name = name , version = self . version )
3976- col . values = list ( b_items )
3977- col . set_atom (
3978- block = b ,
3898+ new_name = name or f"values_block_ { i } "
3899+ itemsize , data_converted , use_str = _maybe_convert_for_string_atom (
3900+ new_name ,
3901+ b ,
39793902 existing_col = existing_col ,
39803903 min_itemsize = min_itemsize ,
39813904 nan_rep = nan_rep ,
39823905 encoding = self .encoding ,
39833906 errors = self .errors ,
3984- info = self .info ,
39853907 )
3908+
3909+ col = klass .create_for_block (i = i , name = new_name , version = self .version )
3910+ col .values = list (b_items )
3911+ col .set_atom (
3912+ block = b ,
3913+ itemsize = itemsize ,
3914+ data_converted = data_converted ,
3915+ use_str = use_str ,
3916+ )
3917+ col .update_info (self .info )
39863918 col .set_pos (j )
39873919
39883920 vaxes .append (col )
@@ -4847,6 +4779,74 @@ def _unconvert_index(data, kind: str, encoding=None, errors="strict"):
48474779 return index
48484780
48494781
def _maybe_convert_for_string_atom(
    name: str, block, existing_col, min_itemsize, nan_rep, encoding, errors
):
    """
    Decide whether ``block`` must be stored as a fixed-width string column
    and, if so, convert its values and work out the item size to use.

    Parameters
    ----------
    name : str
        Column name; used to look up a per-column entry in ``min_itemsize``.
    block
        The block being written.  Only ``is_object``, ``dtype``, ``values``,
        ``shape``, ``fillna``, ``iget`` and ``mgr_locs`` are used here.
    existing_col
        Previously stored column to stay compatible with, or None.
    min_itemsize : int, dict or None
        Minimum string width.  A dict is looked up by ``name`` first and by
        the key ``"values"`` second.
    nan_rep
        Replacement written in place of missing values.
    encoding, errors
        Passed through to ``_convert_string_array``.

    Returns
    -------
    tuple
        ``(itemsize, data, use_str)``.  For blocks that are not converted,
        this is ``(block.dtype.itemsize, block.values, False)``.  For string
        blocks, ``data`` is the converted array (same shape as the block)
        and ``use_str`` is True.

    Raises
    ------
    TypeError
        If the values are inferred as "date" or "datetime", or if an object
        column holds anything other than strings.
    """
    use_str = False

    # Non-object blocks are never stored as strings; hand them back as-is.
    if not block.is_object:
        return block.dtype.itemsize, block.values, use_str

    dtype_name = block.dtype.name
    inferred_type = lib.infer_dtype(block.values, skipna=False)

    if inferred_type == "date":
        raise TypeError("[date] is not implemented as a table column")
    elif inferred_type == "datetime":
        # after GH#8260
        # this only would be hit for a multi-timezone dtype which is an error
        raise TypeError(
            "too many timezones in this block, create separate data columns"
        )

    elif not (inferred_type == "string" or dtype_name == "object"):
        return block.dtype.itemsize, block.values, use_str

    use_str = True

    # Fill missing values with nan_rep; downcast=False keeps the block's
    # dtype from being changed by the fill.
    block = block.fillna(nan_rep, downcast=False)
    if isinstance(block, list):
        # Note: because block is always object dtype, fillna goes
        #  through a path such that the result is always a 1-element list
        block = block[0]
    data = block.values

    # see if we have a valid string type
    inferred_type = lib.infer_dtype(data.ravel(), skipna=False)
    if inferred_type != "string":

        # we cannot serialize this data, so report an exception on a column
        # by column basis
        # block.shape[0] is already the number of columns in the block (an
        # int), so iterate over it directly rather than taking its len().
        for i in range(block.shape[0]):

            col = block.iget(i)
            inferred_type = lib.infer_dtype(col.ravel(), skipna=False)
            if inferred_type != "string":
                iloc = block.mgr_locs.indexer[i]
                raise TypeError(
                    f"Cannot serialize the column [{iloc}] because\n"
                    f"its data contents are [{inferred_type}] object dtype"
                )

    # itemsize is the maximum length of a string (along any dimension)
    data_converted = _convert_string_array(data, encoding, errors).reshape(data.shape)
    assert data_converted.shape == block.shape, (data_converted.shape, block.shape)
    itemsize = data_converted.itemsize

    # specified min_itemsize?
    if isinstance(min_itemsize, dict):
        min_itemsize = int(min_itemsize.get(name) or min_itemsize.get("values") or 0)
    itemsize = max(min_itemsize or 0, itemsize)

    # check for column in the values conflicts
    if existing_col is not None:
        eci = existing_col.validate_col(itemsize)
        # NOTE(review): validate_col is not visible here; guard against a
        # None result so the comparison cannot fail - confirm its contract.
        if eci is not None and eci > itemsize:
            itemsize = eci

    return itemsize, data_converted, use_str
4848+
4849+
48504850def _convert_string_array (data , encoding , errors , itemsize = None ):
48514851 """
48524852 we take a string-like that is object dtype and coerce to a fixed size
0 commit comments