diff --git a/README.md b/README.md index 52b8de78..a0abc6a1 100644 --- a/README.md +++ b/README.md @@ -458,11 +458,13 @@ shapeType Point do not have a bounding box 'bbox'. ... if not name.startswith('_'): ... name 'bbox' + 'from_byte_stream' 'oid' 'parts' 'points' 'shapeType' 'shapeTypeName' + 'write_to_byte_stream' * `oid`: The shape's index position in the original shapefile. diff --git a/pyproject.toml b/pyproject.toml index a8e14c4a..d3e0e894 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -112,9 +112,11 @@ load-plugins=[ "pylint_per_file_ignores", ] -# Silence warning: shapefile.py:2076:20: W0212: Access to a protected -# member _from_geojson of a client class (protected-access) -# shapefile.py:950:16: W0201: Attribute 'm' defined outside __init__ (attribute-defined-outside-init) +# Silence warnings: src/shapefile.py:2076:20: W0212: Access to a protected member _from_geojson of a client class (protected-access) +# src/shapefile.py:950:16: W0201: Attribute 'm' defined outside __init__ (attribute-defined-outside-init) +# src/shapefile.py:973:12: W0707: Consider explicitly re-raising using 'except error as exc' and +# 'raise ShapefileException(f'Failed to write bounding box for record {i}. +# Expected floats.') from exc' (raise-missing-from) # Silence remarks: # src\shapefile.py:338:0: R0914: Too many local variables (21/15) (too-many-locals) # src\shapefile.py:338:0: R0912: Too many branches (24/12) (too-many-branches) @@ -134,6 +136,6 @@ load-plugins=[ # https://github.com/christopherpickering/pylint-per-file-ignores/issues/160 [tool.pylint.'messages control'] per-file-ignores = [ - "/src/shapefile.py:W0212,W0201,R0902,R0903,R0904,R0911,R0912,R0913,R0914,R0915,R0917,R1732", + "/src/shapefile.py:W0707,W0212,W0201,R0902,R0903,R0904,R0911,R0912,R0913,R0914,R0915,R0917,R1732", "test_shapefile.py:W0212,R1732", ] diff --git a/run_benchmarks.py b/run_benchmarks.py index 0b8b8288..edc2119a 100644 --- a/run_benchmarks.py +++ b/run_benchmarks.py @@ -2,14 +2,16 @@ from __future__ import annotations +import collections import functools import os import timeit from collections.abc import Callable from pathlib import Path +from tempfile import TemporaryFile as TempF from typing import Union -import shapefile as shp +import shapefile # For shapefiles from https://github.com/JamesParrott/PyShp_test_shapefile DEFAULT_PYSHP_TEST_REPO = ( @@ -31,26 +33,41 @@ def benchmark( name: str, run_count: int, func: Callable, - col_width: tuple, + col_widths: tuple, compare_to: float | None = None, ) -> float: placeholder = "Running..." - print(f"{name:>{col_width[0]}} | {placeholder}", end="", flush=True) + print(f"{name:>{col_widths[0]}} | {placeholder}", end="", flush=True) time_taken = timeit.timeit(func, number=run_count) print("\b" * len(placeholder), end="") time_suffix = " s" - print(f"{time_taken:{col_width[1]-len(time_suffix)}.3g}{time_suffix}", end="") + print(f"{time_taken:{col_widths[1]-len(time_suffix)}.3g}{time_suffix}", end="") print() return time_taken +fields = {} +shapeRecords = collections.defaultdict(list) + + def open_shapefile_with_PyShp(target: Union[str, os.PathLike]): - with shp.Reader(target) as r: + with shapefile.Reader(target) as r: + fields[target] = r.fields for shapeRecord in r.iterShapeRecords(): - pass + shapeRecords[target].append(shapeRecord) + + +def write_shapefile_with_PyShp(target: Union[str, os.PathLike]): + with TempF("wb") as shp, TempF("wb") as dbf, TempF("wb") as shx: + with shapefile.Writer(shp=shp, dbf=dbf, shx=shx) as w: # type: ignore [arg-type] + for field_info_tuple in fields[target]: + w.field(*field_info_tuple) + for shapeRecord in shapeRecords[target]: + w.shape(shapeRecord.shape) + w.record(*shapeRecord.record) -READER_TESTS = { +SHAPEFILES = { "Blockgroups": blockgroups_file, "Edit": edit_file, "Merge": merge_file, @@ -60,24 +77,47 @@ def open_shapefile_with_PyShp(target: Union[str, os.PathLike]): } -def run(run_count: int) -> None: - col_width = (21, 10) +# Load files to avoid one off delays that only affect first disk seek +for file_path in SHAPEFILES.values(): + file_path.read_bytes() + +reader_benchmarks = [ + functools.partial( + benchmark, + name=f"Read {test_name}", + func=functools.partial(open_shapefile_with_PyShp, target=target), + ) + for test_name, target in SHAPEFILES.items() +] + +# Require fields and shapeRecords to first have been populated +# from data from previouly running the reader_benchmarks +writer_benchmarks = [ + functools.partial( + benchmark, + name=f"Write {test_name}", + func=functools.partial(write_shapefile_with_PyShp, target=target), + ) + for test_name, target in SHAPEFILES.items() +] + + +def run(run_count: int, benchmarks: list[Callable[[], None]]) -> None: + col_widths = (22, 10) col_head = ("parser", "exec time", "performance (more is better)") - # Load files to avoid one off delays that only affect first disk seek - for file_path in READER_TESTS.values(): - file_path.read_bytes() print(f"Running benchmarks {run_count} times:") - print("-" * col_width[0] + "---" + "-" * col_width[1]) - print(f"{col_head[0]:>{col_width[0]}} | {col_head[1]:>{col_width[1]}}") - print("-" * col_width[0] + "-+-" + "-" * col_width[1]) - for test_name, target in READER_TESTS.items(): - benchmark( - f"Read {test_name}", - run_count, - functools.partial(open_shapefile_with_PyShp, target=target), - col_width, + print("-" * col_widths[0] + "---" + "-" * col_widths[1]) + print(f"{col_head[0]:>{col_widths[0]}} | {col_head[1]:>{col_widths[1]}}") + print("-" * col_widths[0] + "-+-" + "-" * col_widths[1]) + for benchmark in benchmarks: + benchmark( # type: ignore [call-arg] + run_count=run_count, + col_widths=col_widths, ) if __name__ == "__main__": - run(1) + print("Reader tests:") + run(1, reader_benchmarks) # type: ignore [arg-type] + print("\n\nWriter tests:") + run(1, writer_benchmarks) # type: ignore [arg-type] diff --git a/shapefiles/test/balancing.dbf b/shapefiles/test/balancing.dbf index c77d63b3..8272cf33 100644 Binary files a/shapefiles/test/balancing.dbf and b/shapefiles/test/balancing.dbf differ diff --git a/shapefiles/test/contextwriter.dbf b/shapefiles/test/contextwriter.dbf index e030c2a3..327fd493 100644 Binary files a/shapefiles/test/contextwriter.dbf and b/shapefiles/test/contextwriter.dbf differ diff --git a/shapefiles/test/corrupt_too_long.dbf b/shapefiles/test/corrupt_too_long.dbf index 57230c5d..e1bb1a55 100644 Binary files a/shapefiles/test/corrupt_too_long.dbf and b/shapefiles/test/corrupt_too_long.dbf differ diff --git a/shapefiles/test/dtype.dbf b/shapefiles/test/dtype.dbf index 1ddda01f..2939da47 100644 Binary files a/shapefiles/test/dtype.dbf and b/shapefiles/test/dtype.dbf differ diff --git a/shapefiles/test/line.dbf b/shapefiles/test/line.dbf index 24f529e8..9e43d68b 100644 Binary files a/shapefiles/test/line.dbf and b/shapefiles/test/line.dbf differ diff --git a/shapefiles/test/multipoint.dbf b/shapefiles/test/multipoint.dbf index 4d7d4f17..74ed8b14 100644 Binary files a/shapefiles/test/multipoint.dbf and b/shapefiles/test/multipoint.dbf differ diff --git a/shapefiles/test/onlydbf.dbf b/shapefiles/test/onlydbf.dbf index e030c2a3..327fd493 100644 Binary files a/shapefiles/test/onlydbf.dbf and b/shapefiles/test/onlydbf.dbf differ diff --git a/shapefiles/test/point.dbf b/shapefiles/test/point.dbf index e29d0859..5a881b87 100644 Binary files a/shapefiles/test/point.dbf and b/shapefiles/test/point.dbf differ diff --git a/shapefiles/test/polygon.dbf b/shapefiles/test/polygon.dbf index b116dc46..1cc8920a 100644 Binary files a/shapefiles/test/polygon.dbf and b/shapefiles/test/polygon.dbf differ diff --git a/shapefiles/test/shapetype.dbf b/shapefiles/test/shapetype.dbf index e030c2a3..327fd493 100644 Binary files a/shapefiles/test/shapetype.dbf and b/shapefiles/test/shapetype.dbf differ diff --git a/shapefiles/test/testfile.dbf b/shapefiles/test/testfile.dbf index e030c2a3..327fd493 100644 Binary files a/shapefiles/test/testfile.dbf and b/shapefiles/test/testfile.dbf differ diff --git a/src/py.typed b/src/py.typed new file mode 100644 index 00000000..e69de29b diff --git a/src/shapefile.py b/src/shapefile.py index 321b215e..b5e6ed4c 100644 --- a/src/shapefile.py +++ b/src/shapefile.py @@ -623,6 +623,7 @@ def __init__( list of shapes. For MultiPatch geometry, partTypes designates the patch type of each of the parts. """ + # Preserve previous behaviour for anyone who set self.shapeType = None if not isinstance(shapeType, _NoShapeTypeSentinel): self.shapeType = shapeType self.points = points or [] @@ -844,153 +845,255 @@ def shapeTypeName(self) -> str: def __repr__(self): return f"Shape #{self.__oid}: {self.shapeTypeName}" - # pylint: disable=unused-argument - def _set_bbox_from_shp_file(self, f): + +class NullShape(Shape): + # Shape.shapeType = NULL already, + # to preserve handling of default args in Shape.__init__ + # Repeated for clarity. + shapeType = NULL + + @classmethod + def from_byte_stream(cls, b_io, next_shape, oid=None, bbox=None): # pylint: disable=unused-argument + # Shape.__init__ sets self.points = points or [] + return cls(oid=oid) + + @staticmethod + def write_to_byte_stream(b_io, s, i, bbox, mbox, zbox): # pylint: disable=unused-argument pass + +class _CanHaveBBox(Shape): + """As well as setting bounding boxes, we also utilize the + fact that this mixin applies to all the shapes that are + not a single point. + """ + + _shapeTypes = frozenset( + [ + POLYLINE, + POLYLINEM, + POLYLINEZ, + POLYGON, + POLYGONM, + POLYGONZ, + MULTIPOINT, + MULTIPOINTM, + MULTIPOINTZ, + MULTIPATCH, + ] + ) + + # Not a BBox because the legacy implementation was a list, not a 4-tuple. + bbox: Optional[Sequence[float]] = None + + def _set_bbox_from_byte_stream(self, b_io): + self.bbox = _Array[float]("d", unpack("<4d", b_io.read(32))) + @staticmethod - def _get_nparts_from_shp_file(f): - return None + def _write_bbox_to_byte_stream(b_io, i, bbox): + try: + b_io.write(pack("<4d", *bbox)) + except error: + raise ShapefileException( + f"Failed to write bounding box for record {i}. Expected floats." + ) @staticmethod - def _get_npoints_from_shp_file(f): - return None + def _get_npoints_from_byte_stream(b_io): + return unpack("= 16: - __mmin, __mmax = unpack("<2d", f.read(16)) + def _set_ms_from_byte_stream(self, b_io, nPoints, next_shape): + if next_shape - b_io.tell() >= 16: + __mmin, __mmax = unpack("<2d", b_io.read(16)) # Measure values less than -10e38 are nodata values according to the spec - if next_shape - f.tell() >= nPoints * 8: + if next_shape - b_io.tell() >= nPoints * 8: self.m = [] - for m in _Array[float]("d", unpack(f"<{nPoints}d", f.read(nPoints * 8))): + for m in _Array[float]("d", unpack(f"<{nPoints}d", b_io.read(nPoints * 8))): if m > NODATA: self.m.append(m) else: @@ -1022,31 +1137,105 @@ def _set_m_from_shp_file(self, f, nPoints, next_shape): else: self.m = [None for _ in range(nPoints)] + @staticmethod + def _write_ms_to_byte_stream(b_io, s, i, mbox): + # Write m extremes and values + # When reading a file, pyshp converts NODATA m values to None, so here we make sure to convert them back to NODATA + # Note: missing m values are autoset to NODATA. + try: + b_io.write(pack("<2d", *mbox)) + except error: + raise ShapefileException( + f"Failed to write measure extremes for record {i}. Expected floats" + ) + try: + if hasattr(s, "m"): + # if m values are stored in attribute + b_io.write( + pack( + f"<{len(s.m)}d", *[m if m is not None else NODATA for m in s.m] + ) + ) + else: + # if m values are stored as 3rd/4th dimension + # 0-index position of m value is 3 if z type (x,y,z,m), or 2 if m type (x,y,m) + mpos = 3 if s.shapeType in _HasZ._shapeTypes else 2 + for p in s.points: + b_io.write( + pack( + " mpos and p[mpos] is not None + else NODATA, + ) + ) + except error: + raise ShapefileException( + f"Failed to write measure values for record {i}. Expected floats" + ) + -class _HasZ(Shape): +class _HasZ(_CanHaveBBox): + # Not a Point + _shapeTypes = frozenset( + [ + POLYLINEZ, + POLYGONZ, + MULTIPOINTZ, + MULTIPATCH, + ] + ) z: Sequence[float] - def _set_z_from_shp_file(self, f, nPoints): - __zmin, __zmax = unpack("<2d", f.read(16)) # pylint: disable=unused-private-member - self.z = _Array[float]("d", unpack(f"<{nPoints}d", f.read(nPoints * 8))) + def _set_zs_from_byte_stream(self, b_io, nPoints): + __zmin, __zmax = unpack("<2d", b_io.read(16)) # pylint: disable=unused-private-member + self.z = _Array[float]("d", unpack(f"<{nPoints}d", b_io.read(nPoints * 8))) + + @staticmethod + def _write_zs_to_byte_stream(b_io, s, i, zbox): + # Write z extremes and values + # Note: missing z values are autoset to 0, but not sure if this is ideal. + try: + b_io.write(pack("<2d", *zbox)) + except error: + raise ShapefileException( + f"Failed to write elevation extremes for record {i}. Expected floats." + ) + try: + if hasattr(s, "z"): + # if z values are stored in attribute + b_io.write(pack(f"<{len(s.z)}d", *s.z)) + else: + # if z values are stored as 3rd dimension + for p in s.points: + b_io.write(pack(" 2 else 0)) + except error: + raise ShapefileException( + f"Failed to write elevation values for record {i}. Expected floats." + ) class MultiPatch(_HasM, _HasZ, _CanHaveParts): shapeType = MULTIPATCH - def _set_part_types_from_shp_file(self, f, nParts): - self.partTypes = _Array[int]("i", unpack(f"<{nParts}i", f.read(nParts * 4))) + def _set_part_types_from_byte_stream(self, b_io, nParts): + self.partTypes = _Array[int]("i", unpack(f"<{nParts}i", b_io.read(nParts * 4))) + @staticmethod + def _write_part_types_to_byte_stream(b_io, s): + for partType in s.partTypes: + b_io.write(pack("= 8: - (m,) = unpack("= 8: + (m,) = unpack(" RecordValue: and IndexError, if the field exists but the field's corresponding value in the Record does not exist """ - # pylint: disable=raise-missing-from try: if item == "__setstate__": # Prevent infinite loop from copy.deepcopy() raise AttributeError("_Record does not implement __setstate__") @@ -1166,7 +1426,6 @@ def __getattr__(self, item: str) -> RecordValue: raise IndexError( f"{item} found as a field but not enough values available." ) - # pylint: enable=raise-missing-from def __setattr__(self, key: str, value: RecordValue): """ @@ -1182,7 +1441,7 @@ def __setattr__(self, key: str, value: RecordValue): index = self.__field_positions[key] return list.__setitem__(self, index, value) except KeyError: - raise AttributeError(f"{key} is not a field name") # pylint: disable=raise-missing-from + raise AttributeError(f"{key} is not a field name") def __getitem__(self, item): """ @@ -1221,7 +1480,7 @@ def __setitem__(self, key, value): if index is not None: return list.__setitem__(self, index, value) - raise IndexError(f"{key} is not a field name and not an int") # pylint: disable=raise-missing-from + raise IndexError(f"{key} is not a field name and not an int") @property def oid(self) -> int: @@ -1802,7 +2061,24 @@ def __shape( f = self.__getFileObj(self.shp) - shape = _read_shape_from_shp_file(f, oid, bbox) + # shape = Shape(oid=oid) + (__recNum, recLength) = unpack_2_int32_be(f.read(8)) + # Determine the start of the next record + + # Convert from num of 16 bit words, to 8 bit bytes + recLength_bytes = 2 * recLength + + next_shape = f.tell() + recLength_bytes + + shapeType = unpack(" str: @@ -2556,7 +2830,6 @@ def __shapefileHeader( Several of the shapefile formats are so similar that a single generic method to read or write them is warranted.""" - # pylint: disable=raise-missing-from f = self.__getFileObj(fileObj) f.seek(0) # File code, Unused bytes @@ -2615,8 +2888,6 @@ def __shapefileHeader( "Failed to write shapefile elevation and measure values. Floats required." ) - # pylint: enable=raise-missing-from - def __dbfHeader(self): """Writes the dbf header and field descriptors.""" f = self.__getFileObj(self.dbf) @@ -2688,7 +2959,6 @@ def shape( self.__shxRecord(offset, length) def __shpRecord(self, s): - # pylint: disable=raise-missing-from f = self.__getFileObj(self.shp) offset = f.tell() # Record number, Content length place holder @@ -2703,180 +2973,45 @@ def __shpRecord(self, s): f"The shape's type ({s.shapeType}) must match " f"the type of the shapefile ({self.shapeType})." ) - f.write(pack(" 2 else 0)) - except error: - raise ShapefileException( - f"Failed to write elevation values for record {self.shpNum}. Expected floats." - ) - # Write m extremes and values - # When reading a file, pyshp converts NODATA m values to None, so here we make sure to convert them back to NODATA - # Note: missing m values are autoset to NODATA. - if s.shapeType in (13, 15, 18, 23, 25, 28, 31): - try: - f.write(pack("<2d", *self.__mbox(s))) - except error: - raise ShapefileException( - f"Failed to write measure extremes for record {self.shpNum}. Expected floats" - ) - try: - if hasattr(s, "m"): - # if m values are stored in attribute - # fmt: off - f.write( - pack( - f"<{len(s.m)}d", - *[m if m is not None else NODATA for m in s.m] - ) - ) - # fmt: on - else: - # if m values are stored as 3rd/4th dimension - # 0-index position of m value is 3 if z type (x,y,z,m), or 2 if m type (x,y,m) - mpos = 3 if s.shapeType in (13, 15, 18, 31) else 2 - for p in s.points: - f.write( - pack( - " mpos and p[mpos] is not None - else NODATA, - ) - ) - except error: - raise ShapefileException( - f"Failed to write measure values for record {self.shpNum}. Expected floats" - ) - # Write a single point - if s.shapeType in (1, 11, 21): - try: - f.write(pack("<2d", s.points[0][0], s.points[0][1])) - except error: - raise ShapefileException( - f"Failed to write point for record {self.shpNum}. Expected floats." - ) - # Write a single Z value - # Note: missing z values are autoset to 0, but not sure if this is ideal. - if s.shapeType == 11: - # update the global z box - self.__zbox(s) - # then write value - if hasattr(s, "z"): - # if z values are stored in attribute - try: - if not s.z: - s.z = (0,) - f.write(pack("i", length)) + f.seek(finish) - # pylint: enable=raise-missing-from + return offset, length def __shxRecord(self, offset, length): """Writes the shx records.""" - # pylint: disable=raise-missing-from f = self.__getFileObj(self.shx) try: f.write(pack(">i", offset // 2)) @@ -2886,8 +3021,6 @@ def __shxRecord(self, offset, length): ) f.write(pack(">i", length)) - # pylint: enable=raise-missing-from - def record( self, *recordList: Iterable[RecordValue], **recordDict: dict[str, RecordValue] ): @@ -3138,6 +3271,8 @@ def _shapeparts( # Make sure polygon rings (parts) are closed # if shapeType in (5, 15, 25, 31): + # This method is never actually called on a MultiPatch + # so we omit its shapeType (31) for efficiency if isinstance(polyShape, Polygon): for part in parts: if part[0] != part[-1]: diff --git a/test_shapefile.py b/test_shapefile.py index 04994af8..2a10d3ee 100644 --- a/test_shapefile.py +++ b/test_shapefile.py @@ -995,36 +995,31 @@ def test_iterRecords_start_stop(): # Arbitrary selection of record indices # (there are 663 records in blockgroups.dbf). - for i in [ + indices = [ 0, 1, 2, - 3, 5, 11, - 17, - 33, - 51, - 103, - 170, - 234, - 435, - 543, + 41, + 310, + 513, N - 3, - N - 2, N - 1, - ]: - for record in sf.iterRecords(start=i): + ] + for i, index in enumerate(indices): + for record in sf.iterRecords(start=index): assert record == sf.record(record.oid) - for record in sf.iterRecords(stop=i): + for record in sf.iterRecords(stop=index): assert record == sf.record(record.oid) - for stop in range(i, len(sf)): + for j in range(i + 1, len(indices)): + stop = indices[j] # test negative indexing from end, as well as # positive values of stop, and its default - for stop_arg in (stop, stop - len(sf)): - for record in sf.iterRecords(start=i, stop=stop_arg): + for stop_arg in (stop, stop - N): + for record in sf.iterRecords(start=index, stop=stop_arg): assert record == sf.record(record.oid)