{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Build the CC3M training annotation file\n",
    "\n",
    "Reads the CC3M download report, keeps one record per image path that exists on disk, and writes the resulting `image`/`caption` pairs as JSON to the annotation path configured for `conceptual_caption_3m`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import json\n",
    "\n",
    "import pandas as pd\n",
    "from tqdm import tqdm\n",
    "from lavis.common.utils import get_abs_path, get_cache_path"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Column names are supplied explicitly; with `names=` and no `header=`,\n",
    "# pandas treats the first line of the file as data.\n",
    "cc3m = pd.read_csv(\n",
    "    \"downloaded_cc3m_report.tsv.gz\",\n",
    "    compression=\"gzip\",\n",
    "    sep=\"\\t\",\n",
    "    names=[\"caption\", \"path\", \"dataset\", \"mimetype\", \"size\", \"status\", \"url\"],\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "caption a very typical bus station\n",
       "path /export/home/.cache/lavis/conceptual_caption/i...\n",
       "dataset cc3m\n",
       "mimetype image/jpeg\n",
       "size 36078\n",
       "status 200\n",
       "url http://lh6.ggpht.com/-IvRtNLNcG8o/TpFyrudaT6I/...\n",
       "Name: 0, dtype: object"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "cc3m.iloc[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "3318333"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(cc3m)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 3130587/3130587 [17:28<00:00, 2986.08it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Found 2759017 valid records\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "# Keep one record per distinct image path whose file is present on disk.\n",
    "#\n",
    "# The position of a path inside `cc3m.path.unique()` is not a row position in\n",
    "# `cc3m` once the report contains a repeated path, so looking the record up\n",
    "# with `cc3m.iloc[i]` pairs the existence check with an unrelated row. Dropping\n",
    "# duplicate paths first keeps each caption attached to the path that is checked.\n",
    "cc3m_unique = cc3m.drop_duplicates(subset=\"path\", keep=\"first\")\n",
    "\n",
    "valid_records = []\n",
    "\n",
    "for path, caption in tqdm(\n",
    "    zip(cc3m_unique[\"path\"], cc3m_unique[\"caption\"]), total=len(cc3m_unique)\n",
    "):\n",
    "    # str() keeps the original guard against non-string path values\n",
    "    # (presumably NaN for rows without a downloaded file - verify against the report).\n",
    "    path = str(path)\n",
    "    if os.path.exists(path):\n",
    "        valid_records.append({\"image\": path, \"caption\": caption})\n",
    "\n",
    "cnt = len(valid_records)\n",
    "\n",
    "print(\"Found {} valid records\".format(cnt))"
   ]
  },
  {
   "cell_type":
"code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "2759017"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(valid_records)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'image': '/export/home/.cache/lavis/conceptual_caption/images/1_3239086386.jpg',\n",
       " 'caption': 'sierra looked stunning in this top and this skirt while performing with person at their former university'}"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "valid_records[1]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Write the annotation file\n",
    "\n",
    "The records are saved as JSON at the train annotation path taken from the `conceptual_caption_3m` dataset config. An existing file is left untouched."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "/export/home/.cache/lavis/conceptual_caption/annotations/cc3m.json already exists\n"
     ]
    }
   ],
   "source": [
    "from omegaconf import OmegaConf\n",
    "\n",
    "\n",
    "config_path = get_abs_path(\"configs/datasets/conceptual_caption/defaults_3m.yaml\")\n",
    "\n",
    "# The first `storage` entry of the train annotations in the dataset config\n",
    "# names the annotation file.\n",
    "ann_path = OmegaConf.load(\n",
    "    config_path\n",
    ").datasets.conceptual_caption_3m.build_info.annotations.train.storage[0]\n",
    "\n",
    "# NOTE(review): get_cache_path presumably resolves the entry against the LAVIS\n",
    "# cache root - confirm in lavis.common.utils.\n",
    "ann_path = get_cache_path(ann_path)\n",
    "\n",
    "if os.path.exists(ann_path):\n",
    "    # Never overwrite an existing annotation file.\n",
    "    print(\"{} already exists\".format(ann_path))\n",
    "else:\n",
    "    # The target directory may be missing; open() would fail without it.\n",
    "    ann_dir = os.path.dirname(ann_path)\n",
    "    if ann_dir:\n",
    "        os.makedirs(ann_dir, exist_ok=True)\n",
    "\n",
    "    # Write to a temporary sibling and rename it into place: an interrupted\n",
    "    # write must not leave a truncated file that the check above would later\n",
    "    # accept as complete.\n",
    "    tmp_path = ann_path + \".tmp\"\n",
    "    with open(tmp_path, \"w\") as f:\n",
    "        # json.dump writes to the file in chunks instead of building the\n",
    "        # whole document as one string first.\n",
    "        json.dump(valid_records, f)\n",
    "    os.replace(tmp_path, ann_path)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3.8.10 ('base')",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.10"
  },
  "orig_nbformat": 4,
  "vscode": {
   "interpreter": {
    "hash": "d4d1e4263499bec80672ea0156c357c1ee493ec2b1c70f0acce89fc37c4a6abe"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}