|
28 | 28 | }, |
29 | 29 | { |
30 | 30 | "cell_type": "code", |
31 | | - "execution_count": 45, |
| 31 | + "execution_count": null, |
32 | 32 | "metadata": { |
33 | 33 | "vscode": { |
34 | 34 | "languageId": "bat" |
35 | 35 | } |
36 | 36 | }, |
37 | | - "outputs": [ |
38 | | - { |
39 | | - "name": "stdout", |
40 | | - "output_type": "stream", |
41 | | - "text": [ |
42 | | - "--2024-06-19 14:09:51-- https://huggingface.co/datasets/pufanyi/VQAv2_TOY/resolve/main/source_data/sample_data.zip\n", |
43 | | - "Resolving huggingface.co (huggingface.co)... 13.33.30.114, 13.33.30.49, 13.33.30.76, ...\n", |
44 | | - "Connecting to huggingface.co (huggingface.co)|13.33.30.114|:443... connected.\n", |
45 | | - "HTTP request sent, awaiting response... 302 Found\n", |
46 | | - "Location: https://cdn-lfs-us-1.huggingface.co/repos/c9/82/c9827770a5c0b13c1b646a275968813f8705db30ac0de29f118bb316c2b2a4eb/8cc2e821b7c6e4b5726a6feeb6214cd2d4810d53f568a5f3565d78e6d1ee5403?response-content-disposition=inline%3B+filename*%3DUTF-8%27%27sample_data.zip%3B+filename%3D%22sample_data.zip%22%3B&response-content-type=application%2Fzip&Expires=1719036591&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTcxOTAzNjU5MX19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy11cy0xLmh1Z2dpbmdmYWNlLmNvL3JlcG9zL2M5LzgyL2M5ODI3NzcwYTVjMGIxM2MxYjY0NmEyNzU5Njg4MTNmODcwNWRiMzBhYzBkZTI5ZjExOGJiMzE2YzJiMmE0ZWIvOGNjMmU4MjFiN2M2ZTRiNTcyNmE2ZmVlYjYyMTRjZDJkNDgxMGQ1M2Y1NjhhNWYzNTY1ZDc4ZTZkMWVlNTQwMz9yZXNwb25zZS1jb250ZW50LWRpc3Bvc2l0aW9uPSomcmVzcG9uc2UtY29udGVudC10eXBlPSoifV19&Signature=kppoby2Wg9BYA-L2HJ0uShfMSULqTXjtN3cbdBdZTvMf4NvNXBJxc0mcPSiz-sqV7d7hJn32IzHze2JnnTGxrVrozYdHeoTuG0EtF%7ERgQz17PbzbEps-MPzl-h4G9d5RImWDBNN3OYTWyvSxFzn12d-owQKrkdEXejUZEkGdzvHgECzLPpuMw%7EXIctwxBBbxrHRtBNU57K2KBwOqw5rujHtQevhMaCeRgxRFlpfc3FDxsl4rUVHrCM79UhPwutpEAtOh%7Ep6%7EdgLOXal6oZKCnejCQg3AjgvuMe4Eot3J37a7yUGToRtx6XX8Q9I1SC2nScXIWwZndOQY-1VNSL1s-A__&Key-Pair-Id=K2FPYV99P2N66Q [following]\n", |
47 | | - "--2024-06-19 14:09:51-- https://cdn-lfs-us-1.huggingface.co/repos/c9/82/c9827770a5c0b13c1b646a275968813f8705db30ac0de29f118bb316c2b2a4eb/8cc2e821b7c6e4b5726a6feeb6214cd2d4810d53f568a5f3565d78e6d1ee5403?response-content-disposition=inline%3B+filename*%3DUTF-8%27%27sample_data.zip%3B+filename%3D%22sample_data.zip%22%3B&response-content-type=application%2Fzip&Expires=1719036591&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTcxOTAzNjU5MX19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy11cy0xLmh1Z2dpbmdmYWNlLmNvL3JlcG9zL2M5LzgyL2M5ODI3NzcwYTVjMGIxM2MxYjY0NmEyNzU5Njg4MTNmODcwNWRiMzBhYzBkZTI5ZjExOGJiMzE2YzJiMmE0ZWIvOGNjMmU4MjFiN2M2ZTRiNTcyNmE2ZmVlYjYyMTRjZDJkNDgxMGQ1M2Y1NjhhNWYzNTY1ZDc4ZTZkMWVlNTQwMz9yZXNwb25zZS1jb250ZW50LWRpc3Bvc2l0aW9uPSomcmVzcG9uc2UtY29udGVudC10eXBlPSoifV19&Signature=kppoby2Wg9BYA-L2HJ0uShfMSULqTXjtN3cbdBdZTvMf4NvNXBJxc0mcPSiz-sqV7d7hJn32IzHze2JnnTGxrVrozYdHeoTuG0EtF%7ERgQz17PbzbEps-MPzl-h4G9d5RImWDBNN3OYTWyvSxFzn12d-owQKrkdEXejUZEkGdzvHgECzLPpuMw%7EXIctwxBBbxrHRtBNU57K2KBwOqw5rujHtQevhMaCeRgxRFlpfc3FDxsl4rUVHrCM79UhPwutpEAtOh%7Ep6%7EdgLOXal6oZKCnejCQg3AjgvuMe4Eot3J37a7yUGToRtx6XX8Q9I1SC2nScXIWwZndOQY-1VNSL1s-A__&Key-Pair-Id=K2FPYV99P2N66Q\n", |
48 | | - "Resolving cdn-lfs-us-1.huggingface.co (cdn-lfs-us-1.huggingface.co)... 3.165.102.80, 3.165.102.25, 3.165.102.95, ...\n", |
49 | | - "Connecting to cdn-lfs-us-1.huggingface.co (cdn-lfs-us-1.huggingface.co)|3.165.102.80|:443... connected.\n", |
50 | | - "HTTP request sent, awaiting response... 200 OK\n", |
51 | | - "Length: 2678607 (2.6M) [application/zip]\n", |
52 | | - "Saving to: ‘data/sample_data.zip’\n", |
53 | | - "\n", |
54 | | - "sample_data.zip 100%[===================>] 2.55M 7.46MB/s in 0.3s \n", |
55 | | - "\n", |
56 | | - "2024-06-19 14:09:52 (7.46 MB/s) - ‘data/sample_data.zip’ saved [2678607/2678607]\n", |
57 | | - "\n" |
58 | | - ] |
59 | | - } |
60 | | - ], |
| 37 | + "outputs": [], |
61 | 38 | "source": [ |
62 | | - "!wget https://huggingface.co/datasets/pufanyi/VQAv2_TOY/resolve/main/source_data/sample_data.zip -P data\n", |
| 39 | + "!wget https://huggingface.co/datasets/lmms-lab/VQAv2_TOY/resolve/main/source_data/sample_data.zip -P data\n", |
63 | 40 | "!unzip data/sample_data.zip -d data" |
64 | 41 | ] |
65 | 42 | }, |
|
107 | 84 | "\n", |
108 | 85 | "features = datasets.Features(\n", |
109 | 86 | " {\n", |
110 | | - " \"question\": datasets.Value(\"string\"),\n", |
111 | 87 | " \"question_id\": datasets.Value(\"int64\"),\n", |
| 88 | + " \"question\": datasets.Value(\"string\"),\n", |
112 | 89 | " \"image_id\": datasets.Value(\"string\"),\n", |
113 | 90 | " \"image\": datasets.Image(),\n", |
114 | | - " \"answers\": datasets.Sequence(datasets.Sequence(feature={\"answer\": datasets.Value(\"string\"), \"answer_confidence\": datasets.Value(\"string\"), \"answer_id\": datasets.Value(\"int64\")})),\n", |
115 | | - " \"answer_type\": datasets.Value(\"string\"),\n", |
116 | | - " \"multiple_choice_answer\": datasets.Value(\"string\"),\n", |
117 | | - " \"question_type\": datasets.Value(\"string\"),\n", |
118 | 91 | " }\n", |
119 | 92 | ")" |
120 | 93 | ] |
|
144 | 117 | "import json\n", |
145 | 118 | "from PIL import Image\n", |
146 | 119 | "\n", |
147 | | - "KEYS = [\"question\", \"question_id\", \"image_id\", \"answers\", \"answer_type\", \"multiple_choice_answer\", \"question_type\"]\n", |
148 | | - "\n", |
149 | 120 | "def generator(qa_file, image_folder, image_prefix):\n", |
150 | | - " # Open and load the question-answer file\n", |
151 | 121 | " with open(qa_file, \"r\") as f:\n", |
152 | 122 | " data = json.load(f)\n", |
153 | 123 | " qa = data[\"questions\"]\n", |
154 | 124 | "\n", |
155 | 125 | " for q in qa:\n", |
156 | | - " # Get the image id\n", |
157 | 126 | " image_id = q[\"image_id\"]\n", |
158 | | - " # Construct the image path\n", |
159 | 127 | " image_path = os.path.join(image_folder, f\"{image_prefix}_{image_id:012}.jpg\")\n", |
160 | | - " # Open the image and add it to the question-answer dictionary\n", |
161 | 128 | " q[\"image\"] = Image.open(image_path)\n", |
162 | | - " # Check if all keys are present in the question-answer dictionary, if not add them with None value\n", |
163 | | - " for key in KEYS:\n", |
164 | | - " if key not in q:\n", |
165 | | - " q[key] = None\n", |
166 | | - " # Yield the question-answer dictionary\n", |
167 | 129 | " yield q" |
168 | 130 | ] |
169 | 131 | }, |
|
189 | 151 | "data_val = datasets.Dataset.from_generator(\n", |
190 | 152 | " generator,\n", |
191 | 153 | " gen_kwargs={\n", |
192 | | - " \"qa_file\": \"data/questions/v2_OpenEnded_mscoco_val2014_questions.json\",\n", |
193 | | - " \"image_folder\": \"data/images/val2014\",\n", |
| 154 | + " \"qa_file\": \"data/questions/vqav2_toy_questions_val2014.json\",\n", |
| 155 | + " \"image_folder\": \"data/images\",\n", |
194 | 156 | " \"image_prefix\": \"COCO_val2014\",\n", |
195 | 157 | " },\n", |
196 | | - " features=features,\n", |
| 158 | + " # For this dataset, there is no need to specify the features, as all cells are non-null and all splits have the same schema\n", |
| 159 | + " # features=features,\n", |
197 | 160 | " num_proc=NUM_PROC,\n", |
198 | 161 | ")\n", |
199 | 162 | "\n", |
200 | 163 | "data_test = datasets.Dataset.from_generator(\n", |
201 | 164 | " generator,\n", |
202 | 165 | " gen_kwargs={\n", |
203 | | - " \"qa_file\": \"data/questions/v2_OpenEnded_mscoco_test2015_questions.json\",\n", |
204 | | - " \"image_folder\": \"data/images/test2015\",\n", |
| 166 | + " \"qa_file\": \"data/questions/vqav2_toy_questions_test2015.json\",\n", |
| 167 | + " \"image_folder\": \"data/images\",\n", |
205 | 168 | " \"image_prefix\": \"COCO_test2015\",\n", |
206 | 169 | " },\n", |
207 | | - " features=features,\n", |
| 170 | + " # features=features,\n", |
208 | 171 | " num_proc=NUM_PROC,\n", |
209 | 172 | ")\n", |
210 | 173 | "\n", |
211 | 174 | "data_test_dev = datasets.Dataset.from_generator(\n", |
212 | 175 | " generator,\n", |
213 | 176 | " gen_kwargs={\n", |
214 | | - " \"qa_file\": \"data/questions/v2_OpenEnded_mscoco_test-dev2015_questions.json\",\n", |
215 | | - " \"image_folder\": \"data/images/test2015\",\n", |
| 177 | + " \"qa_file\": \"data/questions/vqav2_toy_questions_test-dev2015.json\",\n", |
| 178 | + " \"image_folder\": \"data/images\",\n", |
216 | 179 | " \"image_prefix\": \"COCO_test2015\",\n", |
217 | 180 | " },\n", |
218 | | - " features=features,\n", |
| 181 | + " # features=features,\n", |
219 | 182 | " num_proc=NUM_PROC,\n", |
220 | 183 | ")" |
221 | 184 | ] |
|
244 | 207 | "metadata": {}, |
245 | 208 | "outputs": [], |
246 | 209 | "source": [ |
247 | | - "data.push_to_hub(\"pufanyi/VQAv2\")" |
| 210 | + "data.push_to_hub(\"lmms-lab/VQAv2_TOY\") # replace lmms-lab to your username" |
248 | 211 | ] |
249 | 212 | }, |
250 | 213 | { |
251 | | - "cell_type": "code", |
252 | | - "execution_count": 44, |
| 214 | + "cell_type": "markdown", |
253 | 215 | "metadata": {}, |
254 | | - "outputs": [ |
255 | | - { |
256 | | - "data": { |
257 | | - "text/plain": [ |
258 | | - "CommitInfo(commit_url='https://huggingface.co/datasets/pufanyi/VQAv2_TOY/commit/b057eff450520a6e3fc7e6be88c3a172c4b5d99b', commit_message='Upload source_data/sample_data.zip with huggingface_hub', commit_description='', oid='b057eff450520a6e3fc7e6be88c3a172c4b5d99b', pr_url=None, pr_revision=None, pr_num=None)" |
259 | | - ] |
260 | | - }, |
261 | | - "execution_count": 44, |
262 | | - "metadata": {}, |
263 | | - "output_type": "execute_result" |
264 | | - } |
265 | | - ], |
266 | 216 | "source": [ |
267 | | - "from huggingface_hub import HfApi\n", |
268 | | - "\n", |
269 | | - "api = HfApi()\n", |
270 | | - "api.upload_file(\n", |
271 | | - " path_or_fileobj=\"/data/pufanyi/project/lmms-eval-public/tools/data/sample_data.zip\",\n", |
272 | | - " path_in_repo=\"source_data/sample_data.zip\",\n", |
273 | | - " repo_id=\"pufanyi/VQAv2_TOY\",\n", |
274 | | - " repo_type=\"dataset\",\n", |
275 | | - ")" |
| 217 | + "Now, you can check the dataset on the [Hugging Face dataset hub](https://huggingface.co/datasets/lmms-lab/VQAv2_TOY)." |
276 | 218 | ] |
277 | 219 | }, |
278 | 220 | { |
|