Skip to content

Bug querying with offset #547

@jacobg

Description

@jacobg

Sorry I don't have a good repro for you because I'm only repro-ing it on my data, but here is a code that seems to clearly demonstrate a bug:

import time

from google.cloud import datastore


def _create_query(ds_client: datastore.Client) -> datastore.Query:
    query = ds_client.query(kind='my-kind')
    query.add_filter(filter=datastore.query.PropertyFilter('active', '=', True))
    query.add_filter(filter=datastore.query.PropertyFilter('account', '=', 111))
    query.add_filter(filter=datastore.query.PropertyFilter('roles', '=', 'w'))
    query.add_filter(filter=datastore.query.PropertyFilter('labels', '=', 222))
    return query


def run():
    ds_client = datastore.Client(project='my-project')

    start = time.time()
    count = 0
    for entity in _create_query(ds_client).fetch():
        count += 1
    end = time.time()
    print(f'Fetched all {count} entities in {end - start} seconds')
    real_count = count

    for offset in range(0, real_count + 2):
        start = time.time()
        count = 0
        for entity in _create_query(ds_client).fetch(offset=offset):
            count += 1
        end = time.time()
        print(f'Fetched at offset={offset} {count} entities in {end - start} seconds')


if __name__ == '__main__':
    run()

In addition, I added two print statements in google/cloud/datastore/query.py's _next_page while loop:

        while (
            response_pb.batch.more_results == _NOT_FINISHED
            and response_pb.batch.skipped_results < query_pb.offset
        ):
            # We haven't finished processing. A likely reason is we haven't
            # skipped all of the results yet. Don't return any results.
            # Instead, rerun query, adjusting offsets. Datastore doesn't process
            # more than 1000 skipped results in a query.
            old_query_pb = query_pb
            query_pb = query_pb2.Query()
            query_pb._pb.CopyFrom(old_query_pb._pb)  # copy for testability
            query_pb.start_cursor = response_pb.batch.skipped_cursor
            query_pb.offset -= response_pb.batch.skipped_results
            print(f'response_pb.batch.skipped_cursor = {response_pb.batch.skipped_cursor}')
            print(f'response_pb.batch.skipped_results = {response_pb.batch.skipped_results}')

            request = {
                "project_id": self._query.project,
                "partition_id": partition_id,
                "read_options": read_options,
                "query": query_pb,
            }
            helpers.set_database_id_to_request(request, self.client.database)

            response_pb = self.client._datastore_api.run_query(
                request=request,
                **kwargs,
            )

This is the output running the script:

Fetched all 16 entities in 1.7923529148101807 seconds
Fetched at offset=0 16 entities in 1.6426568031311035 seconds
Fetched at offset=1 15 entities in 1.7998042106628418 seconds
Fetched at offset=2 14 entities in 1.525015115737915 seconds
Fetched at offset=3 13 entities in 1.4684479236602783 seconds
response_pb.batch.skipped_cursor = b'\n7\x121j\x0cs~my-projectr!\x0b\x12\x14my_kind\x18\xdb\x81\x83\x8c\xb5\x91\xe4\x03\x0c\x18\x00 \x00'
response_pb.batch.skipped_results = 3
Fetched at offset=4 12 entities in 1.6105940341949463 seconds
response_pb.batch.skipped_cursor = b'\n7\x121j\x0cs~my-projectr!\x0b\x12\x14my_kind\x18\xdb\x81\x83\x8c\xb5\x91\xe4\x03\x0c\x18\x00 \x00'
response_pb.batch.skipped_results = 3
Fetched at offset=5 11 entities in 1.4919788837432861 seconds
response_pb.batch.skipped_cursor = b'\n7\x121j\x0cs~my-projectr!\x0b\x12\x14my_kind\x18\xdb\x81\x83\x8c\xb5\x91\xe4\x03\x0c\x18\x00 \x00'
response_pb.batch.skipped_results = 3
Fetched at offset=6 10 entities in 1.6434409618377686 seconds
response_pb.batch.skipped_cursor = b'\n7\x121j\x0cs~my-projectr!\x0b\x12\x14my_kind\x18\xdb\x81\x83\x8c\xb5\x91\xe4\x03\x0c\x18\x00 \x00'
response_pb.batch.skipped_results = 3
response_pb.batch.skipped_cursor = b'\n7\x121j\x0cs~my-projectr!\x0b\x12\x14my_kind\x18\xcd\x9e\x8d\xc1\xf7\xb4\xcf\x06\x0c\x18\x00 \x00'
response_pb.batch.skipped_results = 3
Fetched at offset=7 9 entities in 1.6705551147460938 seconds
response_pb.batch.skipped_cursor = b'\n7\x121j\x0cs~my-projectr!\x0b\x12\x14my_kind\x18\xdb\x81\x83\x8c\xb5\x91\xe4\x03\x0c\x18\x00 \x00'
response_pb.batch.skipped_results = 3
response_pb.batch.skipped_cursor = b'\n7\x121j\x0cs~my-projectr!\x0b\x12\x14my_kind\x18\xcd\x9e\x8d\xc1\xf7\xb4\xcf\x06\x0c\x18\x00 \x00'
response_pb.batch.skipped_results = 3
Fetched at offset=8 8 entities in 1.490036964416504 seconds
response_pb.batch.skipped_cursor = b'\n7\x121j\x0cs~my-projectr!\x0b\x12\x14my_kind\x18\xdb\x81\x83\x8c\xb5\x91\xe4\x03\x0c\x18\x00 \x00'
response_pb.batch.skipped_results = 3
response_pb.batch.skipped_cursor = b'\n7\x121j\x0cs~my-projectr!\x0b\x12\x14my_kind\x18\xcd\x9e\x8d\xc1\xf7\xb4\xcf\x06\x0c\x18\x00 \x00'
response_pb.batch.skipped_results = 3
Fetched at offset=9 7 entities in 1.263127088546753 seconds
response_pb.batch.skipped_cursor = b'\n7\x121j\x0cs~my-projectr!\x0b\x12\x14my_kind\x18\xdb\x81\x83\x8c\xb5\x91\xe4\x03\x0c\x18\x00 \x00'
response_pb.batch.skipped_results = 3
response_pb.batch.skipped_cursor = b'\n7\x121j\x0cs~my-projectr!\x0b\x12\x14my_kind\x18\xcd\x9e\x8d\xc1\xf7\xb4\xcf\x06\x0c\x18\x00 \x00'
response_pb.batch.skipped_results = 3
response_pb.batch.skipped_cursor = b'\n7\x121j\x0cs~my-projectr!\x0b\x12\x14my_kind\x18\x80\x80\xf0\xde\xf1\xc5\x9c\x08\x0c\x18\x00 \x00'
response_pb.batch.skipped_results = 3
Fetched at offset=10 6 entities in 1.1788661479949951 seconds
response_pb.batch.skipped_cursor = b'\n7\x121j\x0cs~my-projectr!\x0b\x12\x14my_kind\x18\xdb\x81\x83\x8c\xb5\x91\xe4\x03\x0c\x18\x00 \x00'
response_pb.batch.skipped_results = 3
response_pb.batch.skipped_cursor = b'\n7\x121j\x0cs~my-projectr!\x0b\x12\x14my_kind\x18\xcd\x9e\x8d\xc1\xf7\xb4\xcf\x06\x0c\x18\x00 \x00'
response_pb.batch.skipped_results = 3
response_pb.batch.skipped_cursor = b'\n7\x121j\x0cs~my-projectr!\x0b\x12\x14my_kind\x18\x80\x80\xf0\xde\xf1\xc5\x9c\x08\x0c\x18\x00 \x00'
response_pb.batch.skipped_results = 3
response_pb.batch.skipped_cursor = b'\n7\x121j\x0cs~my-projectr!\x0b\x12\x14my_kind\x18\xda\xc7\xf2\xf9\xab\xbe\xab\x08\x0c\x18\x00 \x00'
response_pb.batch.skipped_results = 1
Fetched at offset=11 5 entities in 1.3047678470611572 seconds
response_pb.batch.skipped_cursor = b'\n7\x121j\x0cs~my-projectr!\x0b\x12\x14my_kind\x18\xdb\x81\x83\x8c\xb5\x91\xe4\x03\x0c\x18\x00 \x00'
response_pb.batch.skipped_results = 3
response_pb.batch.skipped_cursor = b'\n7\x121j\x0cs~my-projectr!\x0b\x12\x14my_kind\x18\xcd\x9e\x8d\xc1\xf7\xb4\xcf\x06\x0c\x18\x00 \x00'
response_pb.batch.skipped_results = 3
response_pb.batch.skipped_cursor = b'\n7\x121j\x0cs~my-projectr!\x0b\x12\x14my_kind\x18\x80\x80\xf0\xde\xf1\xc5\x9c\x08\x0c\x18\x00 \x00'
response_pb.batch.skipped_results = 3
response_pb.batch.skipped_cursor = b'\n7\x121j\x0cs~my-projectr!\x0b\x12\x14my_kind\x18\xda\xc7\xf2\xf9\xab\xbe\xab\x08\x0c\x18\x00 \x00'
response_pb.batch.skipped_results = 1
response_pb.batch.skipped_cursor = b'\n7\x121j\x0cs~my-projectr!\x0b\x12\x14my_kind\x18\x80\x80\xf0\xde\xf1\xc5\xdc\x08\x0c\x18\x00 \x00'
response_pb.batch.skipped_results = 1
Fetched at offset=12 4 entities in 1.4790778160095215 seconds
response_pb.batch.skipped_cursor = b'\n7\x121j\x0cs~my-projectr!\x0b\x12\x14my_kind\x18\xdb\x81\x83\x8c\xb5\x91\xe4\x03\x0c\x18\x00 \x00'
response_pb.batch.skipped_results = 3
response_pb.batch.skipped_cursor = b'\n7\x121j\x0cs~my-projectr!\x0b\x12\x14my_kind\x18\xcd\x9e\x8d\xc1\xf7\xb4\xcf\x06\x0c\x18\x00 \x00'
response_pb.batch.skipped_results = 3
response_pb.batch.skipped_cursor = b'\n7\x121j\x0cs~my-projectr!\x0b\x12\x14my_kind\x18\x80\x80\xf0\xde\xf1\xc5\x9c\x08\x0c\x18\x00 \x00'
response_pb.batch.skipped_results = 3
response_pb.batch.skipped_cursor = b'\n7\x121j\x0cs~my-projectr!\x0b\x12\x14my_kind\x18\xda\xc7\xf2\xf9\xab\xbe\xab\x08\x0c\x18\x00 \x00'
response_pb.batch.skipped_results = 1
response_pb.batch.skipped_cursor = b'\n7\x121j\x0cs~my-projectr!\x0b\x12\x14my_kind\x18\x80\x80\xf0\xde\xf1\xc5\xdc\x08\x0c\x18\x00 \x00'
response_pb.batch.skipped_results = 1
response_pb.batch.skipped_cursor = b'\n7\x121j\x0cs~my-projectr!\x0b\x12\x14my_kind\x18\xb4\xa6\x81\xd3\x84\xc7\x88\t\x0c\x18\x00 \x00'
response_pb.batch.skipped_results = 1
response_pb.batch.skipped_cursor = b''
response_pb.batch.skipped_results = 0
Fetched at offset=13 15 entities in 2.0598793029785156 seconds
response_pb.batch.skipped_cursor = b'\n7\x121j\x0cs~my-projectr!\x0b\x12\x14my_kind\x18\xdb\x81\x83\x8c\xb5\x91\xe4\x03\x0c\x18\x00 \x00'
response_pb.batch.skipped_results = 3
response_pb.batch.skipped_cursor = b'\n7\x121j\x0cs~my-projectr!\x0b\x12\x14my_kind\x18\xcd\x9e\x8d\xc1\xf7\xb4\xcf\x06\x0c\x18\x00 \x00'
response_pb.batch.skipped_results = 3
response_pb.batch.skipped_cursor = b'\n7\x121j\x0cs~my-projectr!\x0b\x12\x14my_kind\x18\x80\x80\xf0\xde\xf1\xc5\x9c\x08\x0c\x18\x00 \x00'
response_pb.batch.skipped_results = 3
response_pb.batch.skipped_cursor = b'\n7\x121j\x0cs~my-projectr!\x0b\x12\x14my_kind\x18\xda\xc7\xf2\xf9\xab\xbe\xab\x08\x0c\x18\x00 \x00'
response_pb.batch.skipped_results = 1
response_pb.batch.skipped_cursor = b'\n7\x121j\x0cs~my-projectr!\x0b\x12\x14my_kind\x18\x80\x80\xf0\xde\xf1\xc5\xdc\x08\x0c\x18\x00 \x00'
response_pb.batch.skipped_results = 1
response_pb.batch.skipped_cursor = b'\n7\x121j\x0cs~my-projectr!\x0b\x12\x14my_kind\x18\xb4\xa6\x81\xd3\x84\xc7\x88\t\x0c\x18\x00 \x00'
response_pb.batch.skipped_results = 1
response_pb.batch.skipped_cursor = b''
response_pb.batch.skipped_results = 0
Fetched at offset=14 14 entities in 2.045603036880493 seconds
response_pb.batch.skipped_cursor = b'\n7\x121j\x0cs~my-projectr!\x0b\x12\x14my_kind\x18\xdb\x81\x83\x8c\xb5\x91\xe4\x03\x0c\x18\x00 \x00'
response_pb.batch.skipped_results = 3
response_pb.batch.skipped_cursor = b'\n7\x121j\x0cs~my-projectr!\x0b\x12\x14my_kind\x18\xcd\x9e\x8d\xc1\xf7\xb4\xcf\x06\x0c\x18\x00 \x00'
response_pb.batch.skipped_results = 3
response_pb.batch.skipped_cursor = b'\n7\x121j\x0cs~my-projectr!\x0b\x12\x14my_kind\x18\x80\x80\xf0\xde\xf1\xc5\x9c\x08\x0c\x18\x00 \x00'
response_pb.batch.skipped_results = 3
response_pb.batch.skipped_cursor = b'\n7\x121j\x0cs~my-projectr!\x0b\x12\x14my_kind\x18\xda\xc7\xf2\xf9\xab\xbe\xab\x08\x0c\x18\x00 \x00'
response_pb.batch.skipped_results = 1
response_pb.batch.skipped_cursor = b'\n7\x121j\x0cs~my-projectr!\x0b\x12\x14my_kind\x18\x80\x80\xf0\xde\xf1\xc5\xdc\x08\x0c\x18\x00 \x00'
response_pb.batch.skipped_results = 1
response_pb.batch.skipped_cursor = b'\n7\x121j\x0cs~my-projectr!\x0b\x12\x14my_kind\x18\xb4\xa6\x81\xd3\x84\xc7\x88\t\x0c\x18\x00 \x00'
response_pb.batch.skipped_results = 1
response_pb.batch.skipped_cursor = b''
response_pb.batch.skipped_results = 0
Fetched at offset=15 13 entities in 2.3071401119232178 seconds
response_pb.batch.skipped_cursor = b'\n7\x121j\x0cs~my-projectr!\x0b\x12\x14my_kind\x18\xdb\x81\x83\x8c\xb5\x91\xe4\x03\x0c\x18\x00 \x00'
response_pb.batch.skipped_results = 3
response_pb.batch.skipped_cursor = b'\n7\x121j\x0cs~my-projectr!\x0b\x12\x14my_kind\x18\xcd\x9e\x8d\xc1\xf7\xb4\xcf\x06\x0c\x18\x00 \x00'
response_pb.batch.skipped_results = 3
response_pb.batch.skipped_cursor = b'\n7\x121j\x0cs~my-projectr!\x0b\x12\x14my_kind\x18\x80\x80\xf0\xde\xf1\xc5\x9c\x08\x0c\x18\x00 \x00'
response_pb.batch.skipped_results = 3
response_pb.batch.skipped_cursor = b'\n7\x121j\x0cs~my-projectr!\x0b\x12\x14my_kind\x18\xda\xc7\xf2\xf9\xab\xbe\xab\x08\x0c\x18\x00 \x00'
response_pb.batch.skipped_results = 1
response_pb.batch.skipped_cursor = b'\n7\x121j\x0cs~my-projectr!\x0b\x12\x14my_kind\x18\x80\x80\xf0\xde\xf1\xc5\xdc\x08\x0c\x18\x00 \x00'
response_pb.batch.skipped_results = 1
response_pb.batch.skipped_cursor = b'\n7\x121j\x0cs~my-projectr!\x0b\x12\x14my_kind\x18\xb4\xa6\x81\xd3\x84\xc7\x88\t\x0c\x18\x00 \x00'
response_pb.batch.skipped_results = 1
response_pb.batch.skipped_cursor = b''
response_pb.batch.skipped_results = 0
response_pb.batch.skipped_cursor = b'\n7\x121j\x0cs~my-projectr!\x0b\x12\x14my_kind\x18\xdb\x81\x83\x8c\xb5\x91\xe4\x03\x0c\x18\x00 \x00'
response_pb.batch.skipped_results = 3
Fetched at offset=16 12 entities in 2.1022279262542725 seconds
response_pb.batch.skipped_cursor = b'\n7\x121j\x0cs~my-projectr!\x0b\x12\x14my_kind\x18\xdb\x81\x83\x8c\xb5\x91\xe4\x03\x0c\x18\x00 \x00'
response_pb.batch.skipped_results = 3
response_pb.batch.skipped_cursor = b'\n7\x121j\x0cs~my-projectr!\x0b\x12\x14my_kind\x18\xcd\x9e\x8d\xc1\xf7\xb4\xcf\x06\x0c\x18\x00 \x00'
response_pb.batch.skipped_results = 3
response_pb.batch.skipped_cursor = b'\n7\x121j\x0cs~my-projectr!\x0b\x12\x14my_kind\x18\x80\x80\xf0\xde\xf1\xc5\x9c\x08\x0c\x18\x00 \x00'
response_pb.batch.skipped_results = 3
response_pb.batch.skipped_cursor = b'\n7\x121j\x0cs~my-projectr!\x0b\x12\x14my_kind\x18\xda\xc7\xf2\xf9\xab\xbe\xab\x08\x0c\x18\x00 \x00'
response_pb.batch.skipped_results = 1
response_pb.batch.skipped_cursor = b'\n7\x121j\x0cs~my-projectr!\x0b\x12\x14my_kind\x18\x80\x80\xf0\xde\xf1\xc5\xdc\x08\x0c\x18\x00 \x00'
response_pb.batch.skipped_results = 1
response_pb.batch.skipped_cursor = b'\n7\x121j\x0cs~my-projectr!\x0b\x12\x14my_kind\x18\xb4\xa6\x81\xd3\x84\xc7\x88\t\x0c\x18\x00 \x00'
response_pb.batch.skipped_results = 1
response_pb.batch.skipped_cursor = b''
response_pb.batch.skipped_results = 0
response_pb.batch.skipped_cursor = b'\n7\x121j\x0cs~my-projectr!\x0b\x12\x14my_kind\x18\xdb\x81\x83\x8c\xb5\x91\xe4\x03\x0c\x18\x00 \x00'
response_pb.batch.skipped_results = 3
Fetched at offset=17 11 entities in 1.9469151496887207 seconds

It seems what happens is that if the while loop that skips results based on offset reaches the end of the actual resultset, then the response returns empty cursor. In that case, what the code should do is realize there are no results and immediately return. However, what the code does instead is move on to run the full query again for one more iteration, then exit the while loop, and fetch and return all the remaining results.

Am I understanding that correctly?

Metadata

Metadata

Assignees

Labels

api: datastoreIssues related to the googleapis/python-datastore API.priority: p2Moderately-important priority. Fix may not be included in next release.type: bugError or flaw in code with unintended results or allowing sub-optimal usage patterns.

Type

No type

Projects

No projects

Milestone

No milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions