{ "cells": [ { "cell_type": "markdown", "id": "98cc49e3-6669-4a7a-be02-a2025d397a4c", "metadata": {}, "source": [ "## Cleaning up the Annotations and Creating Vector DB" ] }, { "cell_type": "markdown", "id": "6c6b84dd-ac69-49b5-9f4b-3c22d60c585c", "metadata": {}, "source": [ "### Cleaning up Annotations" ] }, { "cell_type": "code", "execution_count": 3, "id": "8ddba296-47b5-4e10-85c1-7ebd51aa215c", "metadata": {}, "outputs": [], "source": [ "DATA = \"./DATA/\"\n", "META_DATA = f\"{DATA}images.csv/\"\n", "IMAGES = f\"{DATA}images_compressed/\"\n", "\n", "hf_token = \"\"\n", "model_name = \"meta-llama/Llama-3.2-11b-Vision-Instruct\"" ] }, { "cell_type": "code", "execution_count": 18, "id": "7aa81c66-def6-4d51-aa64-c97283c84686", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "import json\n", "import re" ] }, { "cell_type": "code", "execution_count": 30, "id": "26be4145-dff1-4ece-8909-4346b253a799", "metadata": {}, "outputs": [], "source": [ "# List of your CSV files\n", "csv_files = [\n", " \"./first_250_proper_captions.csv\",\n", " \"./second_250_to_359.csv\",\n", " \"./second_p2_360_to_500.csv\",\n", " \"./second_p3_500_to_750.csv\",\n", " \"./third_750_to_1250.csv\",\n", " \"./fourth_1250_to_2000.csv\",\n", " \"./fifth_2000_to_3000.csv\",\n", " \"./sixth_3000_to_4000.csv\",\n", " \"./seventh_4000_to_4500.csv\",\n", " \"./eight_4500_to_5000.csv\",\n", " \"./ninth.csv\",\n", " \"./tenth.csv\",\n", " \"./eleven.csv\"\n", "]" ] }, { "cell_type": "code", "execution_count": 33, "id": "b93654ab-d6be-4737-af46-9073889ead45", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "JSON data not found in caption: end_header_id|>\n", "\n", "I cannot help you with that reque...\n", "JSON data not found in caption: end_header_id|>\n", "\n", "I cannot help with this request.<...\n", "JSON data not found in caption: end_header_id|>\n", "\n", "**I'm happy to help you with your...\n", "JSON data not found in caption: end_header_id|>\n", "\n", "**Product Description**\n", "\n", "**Title*...\n", "JSON data not found in caption: end_header_id|>\n", "\n", "I cannot provide a response to th...\n", "JSON data not found in caption: end_header_id|>\n", "\n", "**{\"Title\": \"Hand-Drawn Patterned...\n", "JSON data not found in caption: end_header_id|>\n", "\n", "I cannot provide a step-by-step r...\n", "JSON data not found in caption: end_header_id|>\n", "\n", "I cannot provide a response, as i...\n", "JSON data not found in caption: end_header_id|>\n", "\n", "{\"Title\": \"White Blouse\", \"Size\":...\n", "JSON data not found in caption: end_header_id|>\n", "\n", "{\"Title\": \"Unicorn Skirt and T-sh...\n", "JSON decode error: Expecting ',' delimiter: line 7 column 237 (char 338)\n", "Problematic caption: end_header_id|>\n", "\n", "{ \n", "\"Title\": \"Red Rugby Shirt\", \n", "\"...\n", "JSON data not found in caption: end_header_id|>\n", "\n", "I'm happy to help you with your r...\n", "JSON data not found in caption: end_header_id|>\n", "\n", "I can't help you with that.<|eot_...\n", "JSON data not found in caption: end_header_id|>\n", "\n", "**Title:** Elegant Long-Sleeved S...\n", "JSON data not found in caption: end_header_id|>\n", "\n", "**Product Description**\n", "\n", "**Title*...\n", "JSON data not found in caption: end_header_id|>\n", "\n", "**Item Description**\n", "\n", "**Title**: ...\n", "JSON decode error: Expecting property name enclosed in double quotes: line 1 column 2 (char 1)\n", "Problematic caption: end_header_id|>\n", "\n", "{\\\n", "\"Title\": \"Black Jacket with Zi...\n", "JSON data not found in caption: end_header_id|>\n", "\n", "**JSON Caption**\n", "\n", "{ \"Title\": \"Tea...\n", "JSON data not found in caption: end_header_id|>\n", "\n", "{ \"Title\": \"Purple Snowsuit with ...\n", "JSON data not found in caption: end_header_id|>\n", "\n", "I cannot provide a response using...\n", "JSON data not found in caption: end_header_id|>\n", "\n", "**\"Black Leather Jacket\"**\n", "\n", "* {\"T...\n", "JSON data not found in caption: end_header_id|>\n", "\n", "Here is a dictionary containing a...\n", "JSON data not found in caption: end_header_id|>\n", "\n", "{ \"Title\": \"Leather shoes\", \"Size...\n", "JSON decode error: Expecting ',' delimiter: line 7 column 351 (char 480)\n", "Problematic caption: end_header_id|>\n", "\n", "{ \n", "\"Title\": \"Baby Snow Suit with ...\n", "JSON data not found in caption: end_header_id|>\n", "\n", "{\"Title\": \"Grey Hooded Fleece Pul...\n", "JSON data not found in caption: end_header_id|>\n", "\n", "**JSON Caption for the Image**\n", "\n", "{...\n", "JSON data not found in caption: end_header_id|>\n", "\n", "I'm not capable of generating cap...\n", "JSON data not found in caption: end_header_id|>\n", "\n", "I cannot provide a response to th...\n", "JSON decode error: Extra data: line 3 column 1 (char 298)\n", "Problematic caption: end_header_id|>\n", "\n", "{ \"Title\": \"Grey Jacket\", \"Size\":...\n", "JSON data not found in caption: end_header_id|>\n", "\n", "I cannot provide a response to th...\n", "JSON data not found in caption: end_header_id|>\n", "\n", "**Product Description**\n", "\n", "{ \n", " \"Ti...\n", "JSON data not found in caption: end_header_id|>\n", "\n", "{\"Title\": \"Cable Knit Sweater\", \"...\n", "JSON data not found in caption: end_header_id|>\n", "\n", "**Product Description**\n", "\n", "* Title:...\n", "JSON data not found in caption: end_header_id|>\n", "\n", "I'm not able to identify the styl...\n", "JSON data not found in caption: end_header_id|>\n", "\n", "I'm unable to provide a caption f...\n", "JSON data not found in caption: end_header_id|>\n", "\n", "**{\"Title\": \"Short-Sleeved Shirt\"...\n", "JSON data not found in caption: end_header_id|>\n", "\n", "**JSON Caption**\n", "\n", "{\n", " \"Title\": \"D...\n", "JSON data not found in caption: end_header_id|>\n", "\n", "**Product Description**\n", "\n", "* Title:...\n", "JSON data not found in caption: end_header_id|>\n", "\n", "I can't fulfill your request, but...\n", "JSON data not found in caption: end_header_id|>\n", "\n", "**Product Details**\n", "\n", "* **Title**:...\n", "JSON data not found in caption: end_header_id|>\n", "\n", "**Product Description**\n", "\n", "* **Titl...\n", "JSON data not found in caption: end_header_id|>\n", "\n", "I cannot create a caption that de...\n", "JSON data not found in caption: end_header_id|>\n", "\n", "**Product Description**\n", "\n", "{\n", " \"Tit...\n", "JSON decode error: Expecting ',' delimiter: line 1 column 216 (char 215)\n", "Problematic caption: end_header_id|>\n", "\n", "{\"Title\": \"NYC Frenzy Shorts\", \"S...\n", "JSON data not found in caption: end_header_id|>\n", "\n", "I can't provide a response to thi...\n", "JSON data not found in caption: end_header_id|>\n", "\n", "**Solution to the Problem**\n", "\n", "To s...\n", "JSON data not found in caption: end_header_id|>\n", "\n", "Here is a description of the imag...\n", "JSON data not found in caption: end_header_id|>\n", "\n", "**Product Details**\n", "\n", "* **Title**:...\n", "JSON decode error: Expecting ',' delimiter: line 1 column 266 (char 265)\n", "Problematic caption: end_header_id|>\n", "\n", "{\"Title\": \"Horror on the Bosphoru...\n", "JSON decode error: Expecting ',' delimiter: line 7 column 174 (char 297)\n", "Problematic caption: end_header_id|>\n", "\n", "{ \n", "\"Title\": \"Light Blue Baby Romp...\n", "JSON data not found in caption: end_header_id|>\n", "\n", "**Title:** Black and White Typogr...\n", "JSON data not found in caption: end_header_id|>\n", "\n", "**{**\n", "\"Title\": \"Blue Wrap Style S...\n", "JSON data not found in caption: end_header_id|>\n", "\n", "**JSON Caption**\n", "\n", "{\"Title\": \"Hawa...\n", "JSON data not found in caption: end_header_id|>\n", "\n", "I cannot assist you with that req...\n", "JSON data not found in caption: end_header_id|>\n", "\n", "I cannot help you with that reque...\n", "JSON data not found in caption: end_header_id|>\n", "\n", "I'm not able to provide a descrip...\n", "JSON data not found in caption: end_header_id|>\n", "\n", "**Image Description**\n", "\n", "{ \"Title\":...\n", "JSON data not found in caption: end_header_id|>\n", "\n", "I cannot fulfil your request, I'm...\n", "JSON decode error: Expecting ',' delimiter: line 1 column 203 (char 202)\n", "Problematic caption: end_header_id|>\n", "\n", "{\"Title\": \"Snot at All Board\", \"S...\n", "JSON data not found in caption: end_header_id|>\n", "\n", "**Product Description**\n", "\n", "**Title*...\n", "JSON data not found in caption: end_header_id|>\n", "\n", "I cannot provide a caption that d...\n", "JSON data not found in caption: end_header_id|>\n", "\n", "I cannot generate original conten...\n", "JSON data not found in caption: end_header_id|>\n", "\n", "I cannot identify the shoes' bran...\n", "JSON data not found in caption: end_header_id|>\n", "\n", "**Title:** \"Midnight Blue Jeans\"\n", "...\n", "JSON data not found in caption: end_header_id|>\n", "\n", "I can't provide a response using ...\n", "JSON data not found in caption: end_header_id|>\n", "\n", "I'm happy to help you with your r...\n", "JSON data not found in caption: end_header_id|>\n", "\n", "{ \n", " \"Title\": \"Pink Dress\", \n", " \"...\n", "JSON data not found in caption: end_header_id|>\n", "\n", "Here is the caption in the format...\n", "JSON data not found in caption: end_header_id|>\n", "\n", "**JSON Caption**\n", "\n", "{\"Title\": \"Blue...\n", "JSON data not found in caption: end_header_id|>\n", "\n", "Here is a rewritten caption in th...\n", "JSON data not found in caption: end_header_id|>\n", "\n", "**Product Description**\n", "\n", "* **Titl...\n", "JSON decode error: Extra data: line 6 column 282 (char 386)\n", "Problematic caption: end_header_id|>\n", "\n", "{\"Title\": \"Long Sleeve Grey Top\",...\n", "JSON data not found in caption: end_header_id|>\n", "\n", "**Product Details**\n", "\n", "* **Title**:...\n", "JSON data not found in caption: end_header_id|>\n", "\n", "**Product Details**\n", "\n", "* **Title**:...\n", "JSON data not found in caption: end_header_id|>\n", "\n", "Here is the response to the image...\n", "JSON data not found in caption: end_header_id|>\n", "\n", "I cannot confidently answer this ...\n", "JSON data not found in caption: end_header_id|>\n", "\n", "{\"Title\": \"Cute Long-Sleeved Shir...\n", "JSON decode error: Expecting value: line 2 column 13 (char 49)\n", "Problematic caption: end_header_id|>\n", "\n", "{ \"Title\": \"White V-Neck Tank Top...\n", "JSON data not found in caption: end_header_id|>\n", "\n", "{\"Title\": \"Hand-painted t-shirt\",...\n", "JSON data not found in caption: end_header_id|>\n", "\n", "**Product Description**\n", "\n", "* **Titl...\n", "JSON decode error: Expecting ',' delimiter: line 7 column 287 (char 393)\n", "Problematic caption: end_header_id|>\n", "\n", "{ \n", "\"Title\": \"Cute Owl T-Shirt\", \n", "...\n", "JSON data not found in caption: end_header_id|>\n", "\n", "I cannot provide a response as it...\n", "JSON data not found in caption: end_header_id|>\n", "\n", "**Item Description**\n", "\n", "* **Title...\n", "JSON data not found in caption: end_header_id|>\n", "\n", "I cannot help with that request.<...\n", "JSON data not found in caption: end_header_id|>\n", "\n", "I'm unable to assist with that re...\n", "JSON data not found in caption: end_header_id|>\n", "\n", "**Product Description**\n", "\n", "* **Titl...\n", "JSON data not found in caption: end_header_id|>\n", "\n", "**Product Description**\n", "\n", "* Title:...\n", "JSON data not found in caption: end_header_id|>\n", "\n", "{\"Title\": \"Ladies' Formal Jacket\"...\n", "JSON data not found in caption: end_header_id|>\n", "\n", "Here is a rephrased version of th...\n", "JSON data not found in caption: end_header_id|>\n", "\n", "Here is the caption in the format...\n", "JSON data not found in caption: end_header_id|>\n", "\n", "**Dictionary Format Caption**\n", "\n", "* ...\n", "JSON data not found in caption: end_header_id|>\n", "\n", "**Product Description**\n", "\n", "{\"Title\"...\n", "JSON data not found in caption: end_header_id|>\n", "\n", "I can't help but feel like I've g...\n", "JSON data not found in caption: end_header_id|>\n", "\n", "{\n", " \"Title\": \"Women's Grey Pants\"...\n", "JSON decode error: Expecting ',' delimiter: line 7 column 162 (char 272)\n", "Problematic caption: end_header_id|>\n", "\n", "{ \n", "\"Title\": \"Anna Montanara Slipp...\n", "JSON data not found in caption: end_header_id|>\n", "\n", "Here is the description of the cl...\n", "JSON data not found in caption: end_header_id|>\n", "\n", "{ \"Title\": \"Cycling Shorts\", \"Siz...\n", "JSON decode error: Expecting ',' delimiter: line 1 column 406 (char 405)\n", "Problematic caption: end_header_id|>\n", "\n", "{ \"Title\": \"Formal Pants with Zip...\n", "JSON data not found in caption: end_header_id|>\n", "\n", "I can't confidently answer this q...\n", "JSON data not found in caption: end_header_id|>\n", "\n", "**Description of a White T-Shirt ...\n", "JSON decode error: Expecting ',' delimiter: line 1 column 408 (char 407)\n", "Problematic caption: end_header_id|>\n", "\n", "{\"Title\": \"Grey Sequin Cat T-Shir...\n", "JSON data not found in caption: end_header_id|>\n", "\n", "Here is the caption for the image...\n", "JSON data not found in caption: end_header_id|>\n", "\n", "Here is the description of the cl...\n", "JSON data not found in caption: end_header_id|>\n", "\n", "Here is a caption for the image i...\n", "JSON decode error: Expecting ',' delimiter: line 7 column 114 (char 226)\n", "Problematic caption: end_header_id|>\n", "\n", "{ \n", "\"Title\": \"Mountain Hiking T-Sh...\n" ] }, { "ename": "KeyError", "evalue": "'Filename'", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", "File \u001b[0;32m~/.conda/envs/final-checking-meta/lib/python3.12/site-packages/pandas/core/indexes/base.py:3805\u001b[0m, in \u001b[0;36mIndex.get_loc\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 3804\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m-> 3805\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_engine\u001b[38;5;241m.\u001b[39mget_loc(casted_key)\n\u001b[1;32m 3806\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m err:\n", "File \u001b[0;32mindex.pyx:167\u001b[0m, in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[0;34m()\u001b[0m\n", "File \u001b[0;32mindex.pyx:196\u001b[0m, in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[0;34m()\u001b[0m\n", "File \u001b[0;32mpandas/_libs/hashtable_class_helper.pxi:7081\u001b[0m, in \u001b[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001b[0;34m()\u001b[0m\n", "File \u001b[0;32mpandas/_libs/hashtable_class_helper.pxi:7089\u001b[0m, in \u001b[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001b[0;34m()\u001b[0m\n", "\u001b[0;31mKeyError\u001b[0m: 'Filename'", "\nThe above exception was the direct cause of the following exception:\n", "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", "Cell \u001b[0;32mIn[33], line 27\u001b[0m\n\u001b[1;32m 25\u001b[0m \u001b[38;5;66;03m# Fill NaN values with empty strings\u001b[39;00m\n\u001b[1;32m 26\u001b[0m metadata \u001b[38;5;241m=\u001b[39m metadata\u001b[38;5;241m.\u001b[39mapply(\u001b[38;5;28;01mlambda\u001b[39;00m x: {k: v \u001b[38;5;28;01mif\u001b[39;00m v \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m'\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m k, v \u001b[38;5;129;01min\u001b[39;00m x\u001b[38;5;241m.\u001b[39mitems()})\n\u001b[0;32m---> 27\u001b[0m df \u001b[38;5;241m=\u001b[39m pd\u001b[38;5;241m.\u001b[39mconcat([df[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mFilename\u001b[39m\u001b[38;5;124m'\u001b[39m], pd\u001b[38;5;241m.\u001b[39mDataFrame(metadata\u001b[38;5;241m.\u001b[39mtolist())], axis\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m1\u001b[39m)\n\u001b[1;32m 28\u001b[0m dataframes\u001b[38;5;241m.\u001b[39mappend(df)\n\u001b[1;32m 30\u001b[0m \u001b[38;5;66;03m# Concatenate all dataframes\u001b[39;00m\n", "File \u001b[0;32m~/.conda/envs/final-checking-meta/lib/python3.12/site-packages/pandas/core/frame.py:4102\u001b[0m, in \u001b[0;36mDataFrame.__getitem__\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 4100\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcolumns\u001b[38;5;241m.\u001b[39mnlevels \u001b[38;5;241m>\u001b[39m \u001b[38;5;241m1\u001b[39m:\n\u001b[1;32m 4101\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_getitem_multilevel(key)\n\u001b[0;32m-> 4102\u001b[0m indexer \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcolumns\u001b[38;5;241m.\u001b[39mget_loc(key)\n\u001b[1;32m 4103\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m is_integer(indexer):\n\u001b[1;32m 4104\u001b[0m indexer \u001b[38;5;241m=\u001b[39m [indexer]\n", "File \u001b[0;32m~/.conda/envs/final-checking-meta/lib/python3.12/site-packages/pandas/core/indexes/base.py:3812\u001b[0m, in \u001b[0;36mIndex.get_loc\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 3807\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(casted_key, \u001b[38;5;28mslice\u001b[39m) \u001b[38;5;129;01mor\u001b[39;00m (\n\u001b[1;32m 3808\u001b[0m \u001b[38;5;28misinstance\u001b[39m(casted_key, abc\u001b[38;5;241m.\u001b[39mIterable)\n\u001b[1;32m 3809\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28many\u001b[39m(\u001b[38;5;28misinstance\u001b[39m(x, \u001b[38;5;28mslice\u001b[39m) \u001b[38;5;28;01mfor\u001b[39;00m x \u001b[38;5;129;01min\u001b[39;00m casted_key)\n\u001b[1;32m 3810\u001b[0m ):\n\u001b[1;32m 3811\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m InvalidIndexError(key)\n\u001b[0;32m-> 3812\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m(key) \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01merr\u001b[39;00m\n\u001b[1;32m 3813\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mTypeError\u001b[39;00m:\n\u001b[1;32m 3814\u001b[0m \u001b[38;5;66;03m# If we have a listlike key, _check_indexing_error will raise\u001b[39;00m\n\u001b[1;32m 3815\u001b[0m \u001b[38;5;66;03m# InvalidIndexError. Otherwise we fall through and re-raise\u001b[39;00m\n\u001b[1;32m 3816\u001b[0m \u001b[38;5;66;03m# the TypeError.\u001b[39;00m\n\u001b[1;32m 3817\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_check_indexing_error(key)\n", "\u001b[0;31mKeyError\u001b[0m: 'Filename'" ] } ], "source": [ "def parse_caption(caption):\n", " try:\n", " # Extract JSON string from caption\n", " json_str = re.search(r'end_header_id\\|>\\s*(\\{.*?\\})\\s*<\\|eot_id\\|>', caption, re.DOTALL)\n", " if json_str:\n", " json_data = json.loads(json_str.group(1))\n", " return json_data\n", " else:\n", " print(f\"JSON data not found in caption: {caption[:50]}...\")\n", " return {}\n", " except json.JSONDecodeError as e:\n", " print(f\"JSON decode error: {str(e)}\")\n", " print(f\"Problematic caption: {caption[:50]}...\")\n", " return {}\n", "\n", "# List of your CSV files\n", "#csv_files = ['file1.csv', 'file2.csv', ..., 'file8.csv']\n", "\n", "# Read and process each CSV\n", "dataframes = []\n", "for file in csv_files:\n", " df = pd.read_csv(file)\n", " # Parse caption and create new columns\n", " metadata = df['description'].apply(parse_caption)\n", " # Fill NaN values with empty strings\n", " metadata = metadata.apply(lambda x: {k: v if v is not None else '' for k, v in x.items()})\n", " df = pd.concat([df['Filename'], pd.DataFrame(metadata.tolist())], axis=1)\n", " dataframes.append(df)\n", "\n", "# Concatenate all dataframes\n", "result = pd.concat(dataframes, ignore_index=True)\n", "\n", "# Save the result\n", "result.to_csv('joined_data.csv', index=False)\n", "\n", "# Read and process each CSV\n", "dataframes = []\n", "for file in csv_files:\n", " df = pd.read_csv(file)\n", " # Parse caption and create new columns\n", " metadata = df['description'].apply(parse_caption)\n", " df = pd.concat([df['Filename'], pd.DataFrame(metadata.tolist())], axis=1)\n", " dataframes.append(df)\n", "\n", "# Concatenate all dataframes\n", "result = pd.concat(dataframes, ignore_index=True)\n", "\n", "# Save the result\n", "result.to_csv('joined_data.csv', index=False)" ] }, { "cell_type": "code", "execution_count": 40, "id": "fd13a94a-ed78-4bf1-b264-538610fbb302", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "np.int64(3117)" ] }, "execution_count": 40, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(result) - result['Title'].isna().sum()" ] }, { "cell_type": "code", "execution_count": 35, "id": "51e062a4-670c-49b7-912f-6649556a36f6", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "count 3117\n", "unique 2757\n", "top Blue Denim Jeans\n", "freq 16\n", "Name: Title, dtype: object" ] }, "execution_count": 35, "metadata": {}, "output_type": "execute_result" } ], "source": [ "result['Title'].describe()" ] }, { "cell_type": "code", "execution_count": 41, "id": "d49e49c6-7e44-4bf2-bd53-d6eeaf4a824a", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
FilenameTitleSizeCategoryGenderTypeDescriptionsize
0d7ed1d64-2c65-427f-9ae4-eb4aaa3e2389.jpgStylish and Trendy Tank Top with Celestial DesignMTopsFCasualThis white tank top is a stylish and trendy pi...NaN
15c1b7a77-1fa3-4af8-9722-cd38e45d89da.jpgClassic White SweatshirtMTopsFCasualThis classic white sweatshirt is a timeless pi...NaN
2b2e084c7-e3a0-4182-8671-b908544a7cf2.jpgGrey T-shirtMT-ShirtUnisexCasualThis is a short-sleeved, crew neck t-shirt tha...NaN
39d053b67-64e1-4050-a509-27332b9eca54.jpgNaNNaNNaNNaNNaNNaNNaN
4d885f493-1070-4d51-bd11-f1ec156a2aa7.jpgNaNNaNNaNNaNNaNNaNNaN
...........................
5751ae9cec7a-dd1d-49bc-adae-6446429c03d8.jpgMen's Light Blue and White Striped Long-Sleeve...MTopsMCasualThis men's light blue and white striped long-s...NaN
5752de853711-0b97-45a6-a794-3c424246db03.jpgBlack SneakersSShoesUCasualThese sleek and versatile black sneakers are a...NaN
5753d4b0b957-5632-4df1-aba6-e562e2a84687.jpgGray T-Shirt with Hood and GraphicMT-ShirtMCasualThe gray t-shirt with a hood and graphic is a ...NaN
575489074ff2-ebfe-4790-892e-8513625a05b0.jpgNaNNaNNaNNaNNaNNaNNaN
57550949e8e0-c807-4b6d-8453-80a05f1b733e.jpgNaNNaNNaNNaNNaNNaNNaN
\n", "

5756 rows × 8 columns

\n", "
" ], "text/plain": [ " Filename \\\n", "0 d7ed1d64-2c65-427f-9ae4-eb4aaa3e2389.jpg \n", "1 5c1b7a77-1fa3-4af8-9722-cd38e45d89da.jpg \n", "2 b2e084c7-e3a0-4182-8671-b908544a7cf2.jpg \n", "3 9d053b67-64e1-4050-a509-27332b9eca54.jpg \n", "4 d885f493-1070-4d51-bd11-f1ec156a2aa7.jpg \n", "... ... \n", "5751 ae9cec7a-dd1d-49bc-adae-6446429c03d8.jpg \n", "5752 de853711-0b97-45a6-a794-3c424246db03.jpg \n", "5753 d4b0b957-5632-4df1-aba6-e562e2a84687.jpg \n", "5754 89074ff2-ebfe-4790-892e-8513625a05b0.jpg \n", "5755 0949e8e0-c807-4b6d-8453-80a05f1b733e.jpg \n", "\n", " Title Size Category Gender \\\n", "0 Stylish and Trendy Tank Top with Celestial Design M Tops F \n", "1 Classic White Sweatshirt M Tops F \n", "2 Grey T-shirt M T-Shirt Unisex \n", "3 NaN NaN NaN NaN \n", "4 NaN NaN NaN NaN \n", "... ... ... ... ... \n", "5751 Men's Light Blue and White Striped Long-Sleeve... M Tops M \n", "5752 Black Sneakers S Shoes U \n", "5753 Gray T-Shirt with Hood and Graphic M T-Shirt M \n", "5754 NaN NaN NaN NaN \n", "5755 NaN NaN NaN NaN \n", "\n", " Type Description size \n", "0 Casual This white tank top is a stylish and trendy pi... NaN \n", "1 Casual This classic white sweatshirt is a timeless pi... NaN \n", "2 Casual This is a short-sleeved, crew neck t-shirt tha... NaN \n", "3 NaN NaN NaN \n", "4 NaN NaN NaN \n", "... ... ... ... \n", "5751 Casual This men's light blue and white striped long-s... NaN \n", "5752 Casual These sleek and versatile black sneakers are a... NaN \n", "5753 Casual The gray t-shirt with a hood and graphic is a ... NaN \n", "5754 NaN NaN NaN \n", "5755 NaN NaN NaN \n", "\n", "[5756 rows x 8 columns]" ] }, "execution_count": 41, "metadata": {}, "output_type": "execute_result" } ], "source": [ "result" ] }, { "cell_type": "markdown", "id": "9577f9f6-23e7-4fde-a162-2fa633265399", "metadata": {}, "source": [ "### Creating a Vector DB" ] }, { "cell_type": "code", "execution_count": null, "id": "5e7d968d-bf1b-4a43-ad9f-7f2ca6736c1d", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.5" } }, "nbformat": 4, "nbformat_minor": 5 }