Skip to content

Commit d1a8fed

Browse files
committed
More robust numShapes and offsets reading, faster shape index when missing shx file, more tests
1 parent 0c7d8b4 commit d1a8fed

File tree

2 files changed

+262
-50
lines changed

2 files changed

+262
-50
lines changed

shapefile.py

Lines changed: 112 additions & 50 deletions
Original file line number | Diff line number | Diff line change
@@ -1118,21 +1118,36 @@ def __len__(self):
11181118
elif self.shp:
11191119
# Otherwise use shape count
11201120
if self.shx:
1121-
# Use index file to get total count
11221121
if self.numShapes is None:
1123-
# File length (16-bit word * 2 = bytes) - header length
1124-
self.shx.seek(24)
1125-
shxRecordLength = (unpack(">i", self.shx.read(4))[0] * 2) - 100
1126-
self.numShapes = shxRecordLength // 8
1127-
1122+
self.__shxHeader()
1123+
11281124
return self.numShapes
11291125

11301126
else:
11311127
# Index file not available, iterate all shapes to get total count
11321128
if self.numShapes is None:
1133-
for i,shape in enumerate(self.iterShapes()):
1134-
pass
1135-
self.numShapes = i + 1
1129+
# Determine length of shp file
1130+
shp = self.shp
1131+
checkpoint = shp.tell()
1132+
shp.seek(0,2)
1133+
shpLength = shp.tell()
1134+
shp.seek(100)
1135+
# Do a fast shape iteration until end of file.
1136+
unpack = Struct('>2i').unpack
1137+
offsets = []
1138+
pos = shp.tell()
1139+
while pos < shpLength:
1140+
offsets.append(pos)
1141+
# Unpack the shape header only
1142+
(recNum, recLength) = unpack(shp.read(8))
1143+
# Jump to next shape position
1144+
pos += 8 + (2 * recLength)
1145+
shp.seek(pos)
1146+
# Set numShapes and offset indices
1147+
self.numShapes = len(offsets)
1148+
self._offsets = offsets
1149+
# Return to previous file position
1150+
shp.seek(checkpoint)
11361151

11371152
return self.numShapes
11381153

@@ -1172,6 +1187,8 @@ def load(self, shapefile=None):
11721187
self.__shpHeader()
11731188
if self.dbf:
11741189
self.__dbfHeader()
1190+
if self.shx:
1191+
self.__shxHeader()
11751192

11761193
def load_shp(self, shapefile_name):
11771194
"""
@@ -1251,7 +1268,7 @@ def __restrictIndex(self, i):
12511268
return i
12521269

12531270
def __shpHeader(self):
1254-
"""Reads the header information from a .shp or .shx file."""
1271+
"""Reads the header information from a .shp file."""
12551272
if not self.shp:
12561273
raise ShapefileException("Shapefile Reader requires a shapefile or file-like object. (no shp file found")
12571274
shp = self.shp
@@ -1353,27 +1370,40 @@ def __shape(self, oid=None, bbox=None):
13531370
f.seek(next)
13541371
return record
13551372

1373+
def __shxHeader(self):
1374+
"""Reads the header information from a .shx file."""
1375+
shx = self.shx
1376+
if not shx:
1377+
raise ShapefileException("Shapefile Reader requires a shapefile or file-like object. (no shx file found")
1378+
# File length (16-bit word * 2 = bytes) - header length
1379+
shx.seek(24)
1380+
shxRecordLength = (unpack(">i", shx.read(4))[0] * 2) - 100
1381+
self.numShapes = shxRecordLength // 8
1382+
1383+
def __shxOffsets(self):
1384+
'''Reads the shape offset positions from a .shx file'''
1385+
shx = self.shx
1386+
if not shx:
1387+
raise ShapefileException("Shapefile Reader requires a shapefile or file-like object. (no shx file found")
1388+
# Jump to the first record.
1389+
shx.seek(100)
1390+
# Each index record consists of two nrs, we only want the first one
1391+
shxRecords = _Array('i', shx.read(2 * self.numShapes * 4) )
1392+
if sys.byteorder != 'big':
1393+
shxRecords.byteswap()
1394+
self._offsets = [2 * el for el in shxRecords[::2]]
1395+
13561396
def __shapeIndex(self, i=None):
13571397
"""Returns the offset in a .shp file for a shape based on information
13581398
in the .shx index file."""
13591399
shx = self.shx
1360-
if not shx:
1400+
# Return None if no shx or no index requested
1401+
if not shx or i == None:
13611402
return None
1403+
# At this point, we know the shx file exists
13621404
if not self._offsets:
1363-
if self.numShapes is None:
1364-
# File length (16-bit word * 2 = bytes) - header length
1365-
shx.seek(24)
1366-
shxRecordLength = (unpack(">i", shx.read(4))[0] * 2) - 100
1367-
self.numShapes = shxRecordLength // 8
1368-
# Jump to the first record.
1369-
shx.seek(100)
1370-
# Each index record consists of two nrs, we only want the first one
1371-
shxRecords = _Array('i', shx.read(2 * self.numShapes * 4) )
1372-
if sys.byteorder != 'big':
1373-
shxRecords.byteswap()
1374-
self._offsets = [2 * el for el in shxRecords[::2]]
1375-
if not i == None:
1376-
return self._offsets[i]
1405+
self.__shxOffsets()
1406+
return self._offsets[i]
13771407

13781408
def shape(self, i=0, bbox=None):
13791409
"""Returns a shape object for a shape in the geometry
@@ -1385,10 +1415,30 @@ def shape(self, i=0, bbox=None):
13851415
i = self.__restrictIndex(i)
13861416
offset = self.__shapeIndex(i)
13871417
if not offset:
1388-
# Shx index not available so iterate the full list.
1389-
for j,k in enumerate(self.iterShapes()):
1390-
if j == i:
1391-
return k
1418+
# Shx index not available.
1419+
# Determine length of shp file
1420+
shp.seek(0,2)
1421+
shpLength = shp.tell()
1422+
shp.seek(100)
1423+
# Do a fast shape iteration until the requested index or end of file.
1424+
unpack = Struct('>2i').unpack
1425+
_i = 0
1426+
offset = shp.tell()
1427+
while offset < shpLength:
1428+
if _i == i:
1429+
# Reached the requested index, exit loop with the offset value
1430+
break
1431+
# Unpack the shape header only
1432+
(recNum, recLength) = unpack(shp.read(8))
1433+
# Jump to next shape position
1434+
offset += 8 + (2 * recLength)
1435+
shp.seek(offset)
1436+
_i += 1
1437+
# If the index was not found, it likely means the .shp file is incomplete
1438+
if _i != i:
1439+
raise ShapefileException('Shape index {} is out of bounds; the .shp file only contains {} shapes'.format(i, _i))
1440+
1441+
# Seek to the offset and read the shape
13921442
shp.seek(offset)
13931443
return self.__shape(oid=i, bbox=bbox)
13941444

@@ -1397,21 +1447,8 @@ def shapes(self, bbox=None):
13971447
To only read shapes within a given spatial region, specify the 'bbox'
13981448
arg as a list or tuple of xmin,ymin,xmax,ymax.
13991449
"""
1400-
shp = self.__getFileObj(self.shp)
1401-
# Found shapefiles which report incorrect
1402-
# shp file length in the header. Can't trust
1403-
# that so we seek to the end of the file
1404-
# and figure it out.
1405-
shp.seek(0,2)
1406-
self.shpLength = shp.tell()
1407-
shp.seek(100)
14081450
shapes = Shapes()
1409-
i = 0
1410-
while shp.tell() < self.shpLength:
1411-
shape = self.__shape(oid=i, bbox=bbox)
1412-
if shape:
1413-
shapes.append(shape)
1414-
i += 1
1451+
shapes.extend(self.iterShapes(bbox=bbox))
14151452
return shapes
14161453

14171454
def iterShapes(self, bbox=None):
@@ -1421,15 +1458,40 @@ def iterShapes(self, bbox=None):
14211458
arg as a list or tuple of xmin,ymin,xmax,ymax.
14221459
"""
14231460
shp = self.__getFileObj(self.shp)
1461+
# Found shapefiles which report incorrect
1462+
# shp file length in the header. Can't trust
1463+
# that so we seek to the end of the file
1464+
# and figure it out.
14241465
shp.seek(0,2)
1425-
self.shpLength = shp.tell()
1466+
shpLength = shp.tell()
14261467
shp.seek(100)
1427-
i = 0
1428-
while shp.tell() < self.shpLength:
1429-
shape = self.__shape(oid=i, bbox=bbox)
1430-
if shape:
1431-
yield shape
1432-
i += 1
1468+
1469+
if self.numShapes:
1470+
# Iterate exactly the number of shapes from shx header
1471+
for i in xrange(self.numShapes):
1472+
# MAYBE: check if more left of file or exit early?
1473+
shape = self.__shape(oid=i, bbox=bbox)
1474+
if shape:
1475+
yield shape
1476+
else:
1477+
# No shx file, unknown nr of shapes
1478+
# Instead iterate until reach end of file
1479+
# Collect the offset indices during iteration
1480+
i = 0
1481+
offsets = []
1482+
pos = shp.tell()
1483+
while pos < shpLength:
1484+
offsets.append(pos)
1485+
shape = self.__shape(oid=i, bbox=bbox)
1486+
pos = shp.tell()
1487+
if shape:
1488+
yield shape
1489+
i += 1
1490+
# Entire shp file consumed
1491+
# Update the number of shapes and list of offsets
1492+
assert i == len(offsets)
1493+
self.numShapes = i
1494+
self._offsets = offsets
14331495

14341496
def __dbfHeader(self):
14351497
"""Reads a dbf header. Xbase-related code borrows heavily from ActiveState Python Cookbook Recipe 362715 by Raymond Hettinger"""

test_shapefile.py

Lines changed: 150 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -616,6 +616,156 @@ def test_shape_oid():
616616
assert shaperec.shape.oid == i
617617

618618

619+
def test_shape_oid_no_shx():
620+
"""
621+
Assert that the shape's oid attribute returns
622+
its index in the shapefile, when shx file is missing.
623+
"""
624+
basename = "shapefiles/blockgroups"
625+
shp = open(basename + ".shp", 'rb')
626+
dbf = open(basename + ".dbf", 'rb')
627+
with shapefile.Reader(shp=shp, dbf=dbf) as sf, \
628+
shapefile.Reader(basename) as sf_expected:
629+
for i in range(len(sf)):
630+
shape = sf.shape(i)
631+
assert shape.oid == i
632+
shape_expected = sf_expected.shape(i)
633+
assert shape.__geo_interface__ == shape_expected.__geo_interface__
634+
635+
for i,shape in enumerate(sf.shapes()):
636+
assert shape.oid == i
637+
shape_expected = sf_expected.shape(i)
638+
assert shape.__geo_interface__ == shape_expected.__geo_interface__
639+
640+
for i,shape in enumerate(sf.iterShapes()):
641+
assert shape.oid == i
642+
shape_expected = sf_expected.shape(i)
643+
assert shape.__geo_interface__ == shape_expected.__geo_interface__
644+
645+
for i,shaperec in enumerate(sf.iterShapeRecords()):
646+
assert shaperec.shape.oid == i
647+
shape_expected = sf_expected.shape(i)
648+
assert shaperec.shape.__geo_interface__ == shape_expected.__geo_interface__
649+
650+
651+
def test_reader_offsets():
652+
"""
653+
Assert that reader will not read the shx offsets unless necessary,
654+
i.e. requesting a shape index.
655+
"""
656+
basename = "shapefiles/blockgroups"
657+
with shapefile.Reader(basename) as sf:
658+
# shx offsets should not be read during loading
659+
assert not sf._offsets
660+
# reading a shape index should trigger reading offsets from shx file
661+
shape = sf.shape(3)
662+
assert len(sf._offsets) == len(sf.shapes())
663+
664+
665+
def test_reader_offsets_no_shx():
666+
"""
667+
Assert that reading a shapefile without a shx file will not build
668+
the offsets unless necessary, i.e. reading all the shapes.
669+
"""
670+
basename = "shapefiles/blockgroups"
671+
shp = open(basename + ".shp", 'rb')
672+
dbf = open(basename + ".dbf", 'rb')
673+
with shapefile.Reader(shp=shp, dbf=dbf) as sf:
674+
# offsets should not be built during loading
675+
assert not sf._offsets
676+
# reading a shape index should iterate to the shape
677+
# but the list of offsets should remain empty
678+
shape = sf.shape(3)
679+
assert not sf._offsets
680+
# reading all the shapes should build the list of offsets
681+
shapes = sf.shapes()
682+
assert len(sf._offsets) == len(shapes)
683+
684+
685+
686+
def test_reader_numshapes():
687+
"""
688+
Assert that reader reads the numShapes attribute from the
689+
shx file header during loading.
690+
"""
691+
basename = "shapefiles/blockgroups"
692+
with shapefile.Reader(basename) as sf:
693+
# numShapes should be set during loading
694+
assert sf.numShapes != None
695+
# numShapes should equal the number of shapes
696+
assert sf.numShapes == len(sf.shapes())
697+
698+
699+
def test_reader_numshapes_no_shx():
700+
"""
701+
Assert that reading a shapefile without a shx file will have
702+
an unknown value for the numShapes attribute (None), and that
703+
reading all the shapes will set the numShapes attribute.
704+
"""
705+
basename = "shapefiles/blockgroups"
706+
shp = open(basename + ".shp", 'rb')
707+
dbf = open(basename + ".dbf", 'rb')
708+
with shapefile.Reader(shp=shp, dbf=dbf) as sf:
709+
# numShapes should be unknown due to missing shx file
710+
assert sf.numShapes == None
711+
# numShapes should be set after reading all the shapes
712+
shapes = sf.shapes()
713+
assert sf.numShapes == len(shapes)
714+
715+
716+
def test_reader_len():
717+
"""
718+
Assert that calling len() on reader is equal to length of
719+
all shapes and records.
720+
"""
721+
basename = "shapefiles/blockgroups"
722+
with shapefile.Reader(basename) as sf:
723+
assert len(sf) == len(sf.records()) == len(sf.shapes())
724+
725+
726+
def test_reader_len_not_loaded():
727+
"""
728+
Assert that calling len() on reader that hasn't loaded a shapefile
729+
yet is equal to 0.
730+
"""
731+
with shapefile.Reader() as sf:
732+
assert len(sf) == 0
733+
734+
735+
def test_reader_len_dbf_only():
736+
"""
737+
Assert that calling len() on reader when reading a dbf file only,
738+
is equal to length of all records.
739+
"""
740+
basename = "shapefiles/blockgroups"
741+
dbf = open(basename + ".dbf", 'rb')
742+
with shapefile.Reader(dbf=dbf) as sf:
743+
assert len(sf) == len(sf.records())
744+
745+
746+
def test_reader_len_no_dbf():
747+
"""
748+
Assert that calling len() on reader when dbf file is missing,
749+
is equal to length of all shapes.
750+
"""
751+
basename = "shapefiles/blockgroups"
752+
shp = open(basename + ".shp", 'rb')
753+
shx = open(basename + ".shx", 'rb')
754+
with shapefile.Reader(shp=shp, shx=shx) as sf:
755+
assert len(sf) == len(sf.shapes())
756+
757+
758+
def test_reader_len_no_dbf_shx():
759+
"""
760+
Assert that calling len() on reader when dbf and shx file is missing,
761+
is equal to length of all shapes.
762+
"""
763+
basename = "shapefiles/blockgroups"
764+
shp = open(basename + ".shp", 'rb')
765+
with shapefile.Reader(shp=shp) as sf:
766+
assert len(sf) == len(sf.shapes())
767+
768+
619769
def test_bboxfilter_shape():
620770
"""
621771
Assert that applying the bbox filter to shape() correctly ignores the shape

0 commit comments

Comments
 (0)