Skip to content

Commit 19f9bd6

Browse files
committed
docs
1 parent ce6f889 commit 19f9bd6

File tree

1 file changed

+17
-75
lines changed

1 file changed

+17
-75
lines changed

tools/make_image_hf_dataset.ipynb

Lines changed: 17 additions & 75 deletions
Original file line numberDiff line numberDiff line change
@@ -28,38 +28,15 @@
2828
},
2929
{
3030
"cell_type": "code",
31-
"execution_count": 45,
31+
"execution_count": null,
3232
"metadata": {
3333
"vscode": {
3434
"languageId": "bat"
3535
}
3636
},
37-
"outputs": [
38-
{
39-
"name": "stdout",
40-
"output_type": "stream",
41-
"text": [
42-
"--2024-06-19 14:09:51-- https://huggingface.co/datasets/pufanyi/VQAv2_TOY/resolve/main/source_data/sample_data.zip\n",
43-
"Resolving huggingface.co (huggingface.co)... 13.33.30.114, 13.33.30.49, 13.33.30.76, ...\n",
44-
"Connecting to huggingface.co (huggingface.co)|13.33.30.114|:443... connected.\n",
45-
"HTTP request sent, awaiting response... 302 Found\n",
46-
"Location: https://cdn-lfs-us-1.huggingface.co/repos/c9/82/c9827770a5c0b13c1b646a275968813f8705db30ac0de29f118bb316c2b2a4eb/8cc2e821b7c6e4b5726a6feeb6214cd2d4810d53f568a5f3565d78e6d1ee5403?response-content-disposition=inline%3B+filename*%3DUTF-8%27%27sample_data.zip%3B+filename%3D%22sample_data.zip%22%3B&response-content-type=application%2Fzip&Expires=1719036591&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTcxOTAzNjU5MX19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy11cy0xLmh1Z2dpbmdmYWNlLmNvL3JlcG9zL2M5LzgyL2M5ODI3NzcwYTVjMGIxM2MxYjY0NmEyNzU5Njg4MTNmODcwNWRiMzBhYzBkZTI5ZjExOGJiMzE2YzJiMmE0ZWIvOGNjMmU4MjFiN2M2ZTRiNTcyNmE2ZmVlYjYyMTRjZDJkNDgxMGQ1M2Y1NjhhNWYzNTY1ZDc4ZTZkMWVlNTQwMz9yZXNwb25zZS1jb250ZW50LWRpc3Bvc2l0aW9uPSomcmVzcG9uc2UtY29udGVudC10eXBlPSoifV19&Signature=kppoby2Wg9BYA-L2HJ0uShfMSULqTXjtN3cbdBdZTvMf4NvNXBJxc0mcPSiz-sqV7d7hJn32IzHze2JnnTGxrVrozYdHeoTuG0EtF%7ERgQz17PbzbEps-MPzl-h4G9d5RImWDBNN3OYTWyvSxFzn12d-owQKrkdEXejUZEkGdzvHgECzLPpuMw%7EXIctwxBBbxrHRtBNU57K2KBwOqw5rujHtQevhMaCeRgxRFlpfc3FDxsl4rUVHrCM79UhPwutpEAtOh%7Ep6%7EdgLOXal6oZKCnejCQg3AjgvuMe4Eot3J37a7yUGToRtx6XX8Q9I1SC2nScXIWwZndOQY-1VNSL1s-A__&Key-Pair-Id=K2FPYV99P2N66Q [following]\n",
47-
"--2024-06-19 14:09:51-- https://cdn-lfs-us-1.huggingface.co/repos/c9/82/c9827770a5c0b13c1b646a275968813f8705db30ac0de29f118bb316c2b2a4eb/8cc2e821b7c6e4b5726a6feeb6214cd2d4810d53f568a5f3565d78e6d1ee5403?response-content-disposition=inline%3B+filename*%3DUTF-8%27%27sample_data.zip%3B+filename%3D%22sample_data.zip%22%3B&response-content-type=application%2Fzip&Expires=1719036591&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTcxOTAzNjU5MX19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy11cy0xLmh1Z2dpbmdmYWNlLmNvL3JlcG9zL2M5LzgyL2M5ODI3NzcwYTVjMGIxM2MxYjY0NmEyNzU5Njg4MTNmODcwNWRiMzBhYzBkZTI5ZjExOGJiMzE2YzJiMmE0ZWIvOGNjMmU4MjFiN2M2ZTRiNTcyNmE2ZmVlYjYyMTRjZDJkNDgxMGQ1M2Y1NjhhNWYzNTY1ZDc4ZTZkMWVlNTQwMz9yZXNwb25zZS1jb250ZW50LWRpc3Bvc2l0aW9uPSomcmVzcG9uc2UtY29udGVudC10eXBlPSoifV19&Signature=kppoby2Wg9BYA-L2HJ0uShfMSULqTXjtN3cbdBdZTvMf4NvNXBJxc0mcPSiz-sqV7d7hJn32IzHze2JnnTGxrVrozYdHeoTuG0EtF%7ERgQz17PbzbEps-MPzl-h4G9d5RImWDBNN3OYTWyvSxFzn12d-owQKrkdEXejUZEkGdzvHgECzLPpuMw%7EXIctwxBBbxrHRtBNU57K2KBwOqw5rujHtQevhMaCeRgxRFlpfc3FDxsl4rUVHrCM79UhPwutpEAtOh%7Ep6%7EdgLOXal6oZKCnejCQg3AjgvuMe4Eot3J37a7yUGToRtx6XX8Q9I1SC2nScXIWwZndOQY-1VNSL1s-A__&Key-Pair-Id=K2FPYV99P2N66Q\n",
48-
"Resolving cdn-lfs-us-1.huggingface.co (cdn-lfs-us-1.huggingface.co)... 3.165.102.80, 3.165.102.25, 3.165.102.95, ...\n",
49-
"Connecting to cdn-lfs-us-1.huggingface.co (cdn-lfs-us-1.huggingface.co)|3.165.102.80|:443... connected.\n",
50-
"HTTP request sent, awaiting response... 200 OK\n",
51-
"Length: 2678607 (2.6M) [application/zip]\n",
52-
"Saving to: ‘data/sample_data.zip’\n",
53-
"\n",
54-
"sample_data.zip 100%[===================>] 2.55M 7.46MB/s in 0.3s \n",
55-
"\n",
56-
"2024-06-19 14:09:52 (7.46 MB/s) - ‘data/sample_data.zip’ saved [2678607/2678607]\n",
57-
"\n"
58-
]
59-
}
60-
],
37+
"outputs": [],
6138
"source": [
62-
"!wget https://huggingface.co/datasets/pufanyi/VQAv2_TOY/resolve/main/source_data/sample_data.zip -P data\n",
39+
"!wget https://huggingface.co/datasets/lmms-lab/VQAv2_TOY/resolve/main/source_data/sample_data.zip -P data\n",
6340
"!unzip data/sample_data.zip -d data"
6441
]
6542
},
@@ -107,14 +84,10 @@
10784
"\n",
10885
"features = datasets.Features(\n",
10986
" {\n",
110-
" \"question\": datasets.Value(\"string\"),\n",
11187
" \"question_id\": datasets.Value(\"int64\"),\n",
88+
" \"question\": datasets.Value(\"string\"),\n",
11289
" \"image_id\": datasets.Value(\"string\"),\n",
11390
" \"image\": datasets.Image(),\n",
114-
" \"answers\": datasets.Sequence(datasets.Sequence(feature={\"answer\": datasets.Value(\"string\"), \"answer_confidence\": datasets.Value(\"string\"), \"answer_id\": datasets.Value(\"int64\")})),\n",
115-
" \"answer_type\": datasets.Value(\"string\"),\n",
116-
" \"multiple_choice_answer\": datasets.Value(\"string\"),\n",
117-
" \"question_type\": datasets.Value(\"string\"),\n",
11891
" }\n",
11992
")"
12093
]
@@ -144,26 +117,15 @@
144117
"import json\n",
145118
"from PIL import Image\n",
146119
"\n",
147-
"KEYS = [\"question\", \"question_id\", \"image_id\", \"answers\", \"answer_type\", \"multiple_choice_answer\", \"question_type\"]\n",
148-
"\n",
149120
"def generator(qa_file, image_folder, image_prefix):\n",
150-
" # Open and load the question-answer file\n",
151121
" with open(qa_file, \"r\") as f:\n",
152122
" data = json.load(f)\n",
153123
" qa = data[\"questions\"]\n",
154124
"\n",
155125
" for q in qa:\n",
156-
" # Get the image id\n",
157126
" image_id = q[\"image_id\"]\n",
158-
" # Construct the image path\n",
159127
" image_path = os.path.join(image_folder, f\"{image_prefix}_{image_id:012}.jpg\")\n",
160-
" # Open the image and add it to the question-answer dictionary\n",
161128
" q[\"image\"] = Image.open(image_path)\n",
162-
" # Check if all keys are present in the question-answer dictionary, if not add them with None value\n",
163-
" for key in KEYS:\n",
164-
" if key not in q:\n",
165-
" q[key] = None\n",
166-
" # Yield the question-answer dictionary\n",
167129
" yield q"
168130
]
169131
},
@@ -189,33 +151,34 @@
189151
"data_val = datasets.Dataset.from_generator(\n",
190152
" generator,\n",
191153
" gen_kwargs={\n",
192-
" \"qa_file\": \"data/questions/v2_OpenEnded_mscoco_val2014_questions.json\",\n",
193-
" \"image_folder\": \"data/images/val2014\",\n",
154+
" \"qa_file\": \"data/questions/vqav2_toy_questions_val2014.json\",\n",
155+
" \"image_folder\": \"data/images\",\n",
194156
" \"image_prefix\": \"COCO_val2014\",\n",
195157
" },\n",
196-
" features=features,\n",
158+
" # For this dataset, there is no need to specify the features, as all cells are non-null and all splits have the same schema\n",
159+
" # features=features,\n",
197160
" num_proc=NUM_PROC,\n",
198161
")\n",
199162
"\n",
200163
"data_test = datasets.Dataset.from_generator(\n",
201164
" generator,\n",
202165
" gen_kwargs={\n",
203-
" \"qa_file\": \"data/questions/v2_OpenEnded_mscoco_test2015_questions.json\",\n",
204-
" \"image_folder\": \"data/images/test2015\",\n",
166+
" \"qa_file\": \"data/questions/vqav2_toy_questions_test2015.json\",\n",
167+
" \"image_folder\": \"data/images\",\n",
205168
" \"image_prefix\": \"COCO_test2015\",\n",
206169
" },\n",
207-
" features=features,\n",
170+
" # features=features,\n",
208171
" num_proc=NUM_PROC,\n",
209172
")\n",
210173
"\n",
211174
"data_test_dev = datasets.Dataset.from_generator(\n",
212175
" generator,\n",
213176
" gen_kwargs={\n",
214-
" \"qa_file\": \"data/questions/v2_OpenEnded_mscoco_test-dev2015_questions.json\",\n",
215-
" \"image_folder\": \"data/images/test2015\",\n",
177+
" \"qa_file\": \"data/questions/vqav2_toy_questions_test-dev2015.json\",\n",
178+
" \"image_folder\": \"data/images\",\n",
216179
" \"image_prefix\": \"COCO_test2015\",\n",
217180
" },\n",
218-
" features=features,\n",
181+
" # features=features,\n",
219182
" num_proc=NUM_PROC,\n",
220183
")"
221184
]
@@ -244,35 +207,14 @@
244207
"metadata": {},
245208
"outputs": [],
246209
"source": [
247-
"data.push_to_hub(\"pufanyi/VQAv2\")"
210+
"data.push_to_hub(\"lmms-lab/VQAv2_TOY\") # replace lmms-lab with your username"
248211
]
249212
},
250213
{
251-
"cell_type": "code",
252-
"execution_count": 44,
214+
"cell_type": "markdown",
253215
"metadata": {},
254-
"outputs": [
255-
{
256-
"data": {
257-
"text/plain": [
258-
"CommitInfo(commit_url='https://huggingface.co/datasets/pufanyi/VQAv2_TOY/commit/b057eff450520a6e3fc7e6be88c3a172c4b5d99b', commit_message='Upload source_data/sample_data.zip with huggingface_hub', commit_description='', oid='b057eff450520a6e3fc7e6be88c3a172c4b5d99b', pr_url=None, pr_revision=None, pr_num=None)"
259-
]
260-
},
261-
"execution_count": 44,
262-
"metadata": {},
263-
"output_type": "execute_result"
264-
}
265-
],
266216
"source": [
267-
"from huggingface_hub import HfApi\n",
268-
"\n",
269-
"api = HfApi()\n",
270-
"api.upload_file(\n",
271-
" path_or_fileobj=\"/data/pufanyi/project/lmms-eval-public/tools/data/sample_data.zip\",\n",
272-
" path_in_repo=\"source_data/sample_data.zip\",\n",
273-
" repo_id=\"pufanyi/VQAv2_TOY\",\n",
274-
" repo_type=\"dataset\",\n",
275-
")"
217+
"Now, you can check the dataset on the [Hugging Face dataset hub](https://huggingface.co/datasets/lmms-lab/VQAv2_TOY)."
276218
]
277219
},
278220
{

0 commit comments

Comments
 (0)