diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 932ae8b1a33d0..efe61716d0831 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -7,9 +7,8 @@ import warnings from csv import QUOTE_MINIMAL, QUOTE_NONNUMERIC, QUOTE_NONE -from libc.stdio cimport fopen, fclose -from libc.stdlib cimport malloc, free -from libc.string cimport strncpy, strlen, strcmp, strcasecmp +from libc.stdlib cimport free +from libc.string cimport strncpy, strlen, strcasecmp cimport cython from cython cimport Py_ssize_t @@ -27,9 +26,6 @@ cdef extern from "Python.h": object PyUnicode_Decode(char *v, Py_ssize_t size, char *encoding, char *errors) -cdef extern from "stdlib.h": - void memcpy(void *dst, void *src, size_t n) - import numpy as np cimport numpy as cnp @@ -50,7 +46,7 @@ from khash cimport ( import pandas.compat as compat from pandas.core.dtypes.common import ( - is_categorical_dtype, CategoricalDtype, + is_categorical_dtype, is_integer_dtype, is_float_dtype, is_bool_dtype, is_object_dtype, is_datetime64_dtype, @@ -90,9 +86,6 @@ try: except NameError: basestring = str -cdef extern from "src/numpy_helper.h": - void transfer_object_column(char *dst, char *src, size_t stride, - size_t length) cdef extern from "parser/tokenizer.h": @@ -232,8 +225,6 @@ cdef extern from "parser/tokenizer.h": int parser_trim_buffers(parser_t *self) - void debug_print_parser(parser_t *self) - int tokenize_all_rows(parser_t *self) nogil int tokenize_nrows(parser_t *self, size_t nrows) nogil @@ -249,7 +240,6 @@ cdef extern from "parser/tokenizer.h": double round_trip(const char *p, char **q, char decimal, char sci, char tsep, int skip_trailing) nogil - int to_longlong(char *item, long long *p_value) nogil int to_boolean(const char *item, uint8_t *val) nogil @@ -875,9 +865,6 @@ cdef class TextReader: return header, field_count - cdef _implicit_index_count(self): - pass - def read(self, rows=None): """ rows=None --> read all rows @@ -997,9 +984,6 @@ cdef class TextReader: return columns - def debug_print(self): - debug_print_parser(self.parser) - cdef _start_clock(self): self.clocks.append(time.time()) @@ -1346,6 +1330,7 @@ cdef class TextReader: else: return None + cdef object _true_values = [b'True', b'TRUE', b'true'] cdef object _false_values = [b'False', b'FALSE', b'false'] @@ -1375,21 +1360,6 @@ cdef asbytes(object o): _NA_VALUES = _ensure_encoded(list(com._NA_VALUES)) -def _is_file_like(obj): - if PY3: - import io - if isinstance(obj, io.TextIOWrapper): - raise ParserError('Cannot handle open unicode files (yet)') - - # BufferedReader is a byte reader for Python 3 - file = io.BufferedReader - else: - import __builtin__ - file = __builtin__.file - - return isinstance(obj, (basestring, file)) - - def _maybe_upcast(arr): """ @@ -1479,6 +1449,7 @@ cdef _string_box_factorize(parser_t *parser, int64_t col, return result, na_count + cdef _string_box_utf8(parser_t *parser, int64_t col, int64_t line_start, int64_t line_end, bint na_filter, kh_str_t *na_hashset): @@ -1532,6 +1503,7 @@ cdef _string_box_utf8(parser_t *parser, int64_t col, return result, na_count + cdef _string_box_decode(parser_t *parser, int64_t col, int64_t line_start, int64_t line_end, bint na_filter, kh_str_t *na_hashset, @@ -1662,6 +1634,7 @@ cdef _categorical_convert(parser_t *parser, int64_t col, kh_destroy_str(table) return np.asarray(codes), result, na_count + cdef _to_fw_string(parser_t *parser, int64_t col, int64_t line_start, int64_t line_end, int64_t width): cdef: @@ -1679,6 +1652,7 @@ cdef _to_fw_string(parser_t *parser, int64_t col, int64_t line_start, return result + cdef inline void _to_fw_string_nogil(parser_t *parser, int64_t col, int64_t line_start, int64_t line_end, size_t width, char *data) nogil: @@ -1694,10 +1668,12 @@ cdef inline void _to_fw_string_nogil(parser_t *parser, int64_t col, strncpy(data, word, width) data += width + cdef char* cinf = b'inf' cdef char* cposinf = b'+inf' cdef char* cneginf = b'-inf' + cdef _try_double(parser_t *parser, int64_t col, int64_t line_start, int64_t line_end, bint na_filter, kh_str_t *na_hashset, object na_flist): @@ -1738,6 +1714,7 @@ cdef _try_double(parser_t *parser, int64_t col, return None, None return result, na_count + cdef inline int _try_double_nogil(parser_t *parser, double (*double_converter)( const char *, char **, char, @@ -1808,6 +1785,7 @@ cdef inline int _try_double_nogil(parser_t *parser, return 0 + cdef _try_uint64(parser_t *parser, int64_t col, int64_t line_start, int64_t line_end, bint na_filter, kh_str_t *na_hashset): @@ -1843,6 +1821,7 @@ cdef _try_uint64(parser_t *parser, int64_t col, return result + cdef inline int _try_uint64_nogil(parser_t *parser, int64_t col, int64_t line_start, int64_t line_end, bint na_filter, @@ -1881,6 +1860,7 @@ cdef inline int _try_uint64_nogil(parser_t *parser, int64_t col, return 0 + cdef _try_int64(parser_t *parser, int64_t col, int64_t line_start, int64_t line_end, bint na_filter, kh_str_t *na_hashset): @@ -1909,6 +1889,7 @@ cdef _try_int64(parser_t *parser, int64_t col, return result, na_count + cdef inline int _try_int64_nogil(parser_t *parser, int64_t col, int64_t line_start, int64_t line_end, bint na_filter, @@ -1948,69 +1929,6 @@ cdef inline int _try_int64_nogil(parser_t *parser, int64_t col, return 0 -cdef _try_bool(parser_t *parser, int64_t col, - int64_t line_start, int64_t line_end, - bint na_filter, kh_str_t *na_hashset): - cdef: - int na_count - Py_ssize_t lines = line_end - line_start - uint8_t *data - cnp.ndarray[cnp.uint8_t, ndim=1] result - - uint8_t NA = na_values[np.bool_] - - result = np.empty(lines) - data = result.data - - with nogil: - error = _try_bool_nogil(parser, col, line_start, - line_end, na_filter, - na_hashset, NA, data, - &na_count) - if error != 0: - return None, None - return result.view(np.bool_), na_count - -cdef inline int _try_bool_nogil(parser_t *parser, int64_t col, - int64_t line_start, - int64_t line_end, bint na_filter, - const kh_str_t *na_hashset, uint8_t NA, - uint8_t *data, int *na_count) nogil: - cdef: - int error - Py_ssize_t i, lines = line_end - line_start - coliter_t it - const char *word = NULL - khiter_t k - na_count[0] = 0 - - coliter_setup(&it, parser, col, line_start) - - if na_filter: - for i in range(lines): - COLITER_NEXT(it, word) - - k = kh_get_str(na_hashset, word) - # in the hash table - if k != na_hashset.n_buckets: - na_count[0] += 1 - data[0] = NA - data += 1 - continue - - error = to_boolean(word, data) - if error != 0: - return error - data += 1 - else: - for i in range(lines): - COLITER_NEXT(it, word) - - error = to_boolean(word, data) - if error != 0: - return error - data += 1 - return 0 cdef _try_bool_flex(parser_t *parser, int64_t col, int64_t line_start, int64_t line_end, @@ -2039,6 +1957,7 @@ cdef _try_bool_flex(parser_t *parser, int64_t col, return None, None return result.view(np.bool_), na_count + cdef inline int _try_bool_flex_nogil(parser_t *parser, int64_t col, int64_t line_start, int64_t line_end, bint na_filter, @@ -2131,6 +2050,7 @@ cdef kh_str_t* kset_from_list(list values) except NULL: return table + cdef kh_float64_t* kset_float64_from_list(values) except NULL: # caller takes responsibility for freeing the hash table cdef: diff --git a/pandas/_libs/src/numpy_helper.h b/pandas/_libs/src/numpy_helper.h index de3486eca3e9b..6c2029fff8a1a 100644 --- a/pandas/_libs/src/numpy_helper.h +++ b/pandas/_libs/src/numpy_helper.h @@ -75,22 +75,6 @@ PANDAS_INLINE PyObject* char_to_string(char* data) { #endif } -void transfer_object_column(char* dst, char* src, size_t stride, - size_t length) { - size_t i; - size_t sz = sizeof(PyObject*); - - for (i = 0; i < length; ++i) { - // uninitialized data - - // Py_XDECREF(*((PyObject**) dst)); - - memcpy(dst, src, sz); - Py_INCREF(*((PyObject**)dst)); - src += sz; - dst += stride; - } -} void set_array_not_contiguous(PyArrayObject* ao) { ao->flags &= ~(NPY_C_CONTIGUOUS | NPY_F_CONTIGUOUS); diff --git a/pandas/_libs/src/parser/.gitignore b/pandas/_libs/src/parser/.gitignore deleted file mode 100644 index f07e771a35eec..0000000000000 --- a/pandas/_libs/src/parser/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -!*.c -test* \ No newline at end of file diff --git a/pandas/_libs/src/parser/Makefile b/pandas/_libs/src/parser/Makefile deleted file mode 100644 index ec88eaf44ba15..0000000000000 --- a/pandas/_libs/src/parser/Makefile +++ /dev/null @@ -1,13 +0,0 @@ -PYTHONBASE = /Library/Frameworks/EPD64.framework/Versions/Current -NUMPY_INC = /Library/Frameworks/EPD64.framework/Versions/7.1/lib/python2.7/site-packages/numpy/core/include -PYTHON_INC = -I$(PYTHONBASE)/include/python2.7 -I$(NUMPY_INC) -PYTHON_LINK = -L$(PYTHONBASE)/lib -lpython - -SOURCES = conversions.c parser.c str_to.c - -check-syntax: - gcc -g $(PYTHON_INC) -o /dev/null -S ${CHK_SOURCES} - -test: $(SOURCES) - gcc $(PYTHON_INC) -o test $(SOURCES) - ./test \ No newline at end of file diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index 2e4ade209fa38..6e8c220eab6b8 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -1317,21 +1317,6 @@ int parser_trim_buffers(parser_t *self) { return 0; } -void debug_print_parser(parser_t *self) { - int64_t j, line; - char *token; - - for (line = 0; line < self->lines; ++line) { - printf("(Parsed) Line %lld: ", (long long)line); - - for (j = 0; j < self->line_fields[j]; ++j) { - token = self->words[j + self->line_start[line]]; - printf("%s ", token); - } - printf("\n"); - } -} - /* nrows : number of rows to tokenize (or until reach EOF) all : tokenize all the data vs. certain number of rows diff --git a/pandas/_libs/src/parser/tokenizer.h b/pandas/_libs/src/parser/tokenizer.h index 9462608a26814..63baf91e3c136 100644 --- a/pandas/_libs/src/parser/tokenizer.h +++ b/pandas/_libs/src/parser/tokenizer.h @@ -247,8 +247,6 @@ void parser_del(parser_t *self); void parser_set_default_options(parser_t *self); -void debug_print_parser(parser_t *self); - int tokenize_nrows(parser_t *self, size_t nrows); int tokenize_all_rows(parser_t *self);