| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702
703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862 |
- {
- "cells": [
- {
- "cell_type": "markdown",
- "id": "98cc49e3-6669-4a7a-be02-a2025d397a4c",
- "metadata": {},
- "source": [
- "## Cleaning up the Annotations and Creating a Vector DB"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "6c6b84dd-ac69-49b5-9f4b-3c22d60c585c",
- "metadata": {},
- "source": [
- "### Cleaning up Annotations"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 3,
- "id": "8ddba296-47b5-4e10-85c1-7ebd51aa215c",
- "metadata": {},
- "outputs": [],
- "source": [
- "DATA = \"./DATA/\"\n",
- "META_DATA = f\"{DATA}images.csv/\"\n",
- "IMAGES = f\"{DATA}images_compressed/\"\n",
- "\n",
- "hf_token = \"\"\n",
- "model_name = \"meta-llama/Llama-3.2-11b-Vision-Instruct\""
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 18,
- "id": "7aa81c66-def6-4d51-aa64-c97283c84686",
- "metadata": {},
- "outputs": [],
- "source": [
- "import pandas as pd\n",
- "import numpy as np\n",
- "import json\n",
- "import re"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 30,
- "id": "26be4145-dff1-4ece-8909-4346b253a799",
- "metadata": {},
- "outputs": [],
- "source": [
- "# List of your CSV files\n",
- "csv_files = [\n",
- " \"./first_250_proper_captions.csv\",\n",
- " \"./second_250_to_359.csv\",\n",
- " \"./second_p2_360_to_500.csv\",\n",
- " \"./second_p3_500_to_750.csv\",\n",
- " \"./third_750_to_1250.csv\",\n",
- " \"./fourth_1250_to_2000.csv\",\n",
- " \"./fifth_2000_to_3000.csv\",\n",
- " \"./sixth_3000_to_4000.csv\",\n",
- " \"./seventh_4000_to_4500.csv\",\n",
- " \"./eight_4500_to_5000.csv\",\n",
- " \"./ninth.csv\",\n",
- " \"./tenth.csv\",\n",
- " \"./eleven.csv\"\n",
- "]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 33,
- "id": "b93654ab-d6be-4737-af46-9073889ead45",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "JSON data not found in caption: end_header_id|>\n",
- "\n",
- "I cannot help you with that reque...\n",
- "JSON data not found in caption: end_header_id|>\n",
- "\n",
- "I cannot help with this request.<...\n",
- "JSON data not found in caption: end_header_id|>\n",
- "\n",
- "**I'm happy to help you with your...\n",
- "JSON data not found in caption: end_header_id|>\n",
- "\n",
- "**Product Description**\n",
- "\n",
- "**Title*...\n",
- "JSON data not found in caption: end_header_id|>\n",
- "\n",
- "I cannot provide a response to th...\n",
- "JSON data not found in caption: end_header_id|>\n",
- "\n",
- "**{\"Title\": \"Hand-Drawn Patterned...\n",
- "JSON data not found in caption: end_header_id|>\n",
- "\n",
- "I cannot provide a step-by-step r...\n",
- "JSON data not found in caption: end_header_id|>\n",
- "\n",
- "I cannot provide a response, as i...\n",
- "JSON data not found in caption: end_header_id|>\n",
- "\n",
- "{\"Title\": \"White Blouse\", \"Size\":...\n",
- "JSON data not found in caption: end_header_id|>\n",
- "\n",
- "{\"Title\": \"Unicorn Skirt and T-sh...\n",
- "JSON decode error: Expecting ',' delimiter: line 7 column 237 (char 338)\n",
- "Problematic caption: end_header_id|>\n",
- "\n",
- "{ \n",
- "\"Title\": \"Red Rugby Shirt\", \n",
- "\"...\n",
- "JSON data not found in caption: end_header_id|>\n",
- "\n",
- "I'm happy to help you with your r...\n",
- "JSON data not found in caption: end_header_id|>\n",
- "\n",
- "I can't help you with that.<|eot_...\n",
- "JSON data not found in caption: end_header_id|>\n",
- "\n",
- "**Title:** Elegant Long-Sleeved S...\n",
- "JSON data not found in caption: end_header_id|>\n",
- "\n",
- "**Product Description**\n",
- "\n",
- "**Title*...\n",
- "JSON data not found in caption: end_header_id|>\n",
- "\n",
- "**Item Description**\n",
- "\n",
- "**Title**: ...\n",
- "JSON decode error: Expecting property name enclosed in double quotes: line 1 column 2 (char 1)\n",
- "Problematic caption: end_header_id|>\n",
- "\n",
- "{\\\n",
- "\"Title\": \"Black Jacket with Zi...\n",
- "JSON data not found in caption: end_header_id|>\n",
- "\n",
- "**JSON Caption**\n",
- "\n",
- "{ \"Title\": \"Tea...\n",
- "JSON data not found in caption: end_header_id|>\n",
- "\n",
- "{ \"Title\": \"Purple Snowsuit with ...\n",
- "JSON data not found in caption: end_header_id|>\n",
- "\n",
- "I cannot provide a response using...\n",
- "JSON data not found in caption: end_header_id|>\n",
- "\n",
- "**\"Black Leather Jacket\"**\n",
- "\n",
- "* {\"T...\n",
- "JSON data not found in caption: end_header_id|>\n",
- "\n",
- "Here is a dictionary containing a...\n",
- "JSON data not found in caption: end_header_id|>\n",
- "\n",
- "{ \"Title\": \"Leather shoes\", \"Size...\n",
- "JSON decode error: Expecting ',' delimiter: line 7 column 351 (char 480)\n",
- "Problematic caption: end_header_id|>\n",
- "\n",
- "{ \n",
- "\"Title\": \"Baby Snow Suit with ...\n",
- "JSON data not found in caption: end_header_id|>\n",
- "\n",
- "{\"Title\": \"Grey Hooded Fleece Pul...\n",
- "JSON data not found in caption: end_header_id|>\n",
- "\n",
- "**JSON Caption for the Image**\n",
- "\n",
- "{...\n",
- "JSON data not found in caption: end_header_id|>\n",
- "\n",
- "I'm not capable of generating cap...\n",
- "JSON data not found in caption: end_header_id|>\n",
- "\n",
- "I cannot provide a response to th...\n",
- "JSON decode error: Extra data: line 3 column 1 (char 298)\n",
- "Problematic caption: end_header_id|>\n",
- "\n",
- "{ \"Title\": \"Grey Jacket\", \"Size\":...\n",
- "JSON data not found in caption: end_header_id|>\n",
- "\n",
- "I cannot provide a response to th...\n",
- "JSON data not found in caption: end_header_id|>\n",
- "\n",
- "**Product Description**\n",
- "\n",
- "{ \n",
- " \"Ti...\n",
- "JSON data not found in caption: end_header_id|>\n",
- "\n",
- "{\"Title\": \"Cable Knit Sweater\", \"...\n",
- "JSON data not found in caption: end_header_id|>\n",
- "\n",
- "**Product Description**\n",
- "\n",
- "* Title:...\n",
- "JSON data not found in caption: end_header_id|>\n",
- "\n",
- "I'm not able to identify the styl...\n",
- "JSON data not found in caption: end_header_id|>\n",
- "\n",
- "I'm unable to provide a caption f...\n",
- "JSON data not found in caption: end_header_id|>\n",
- "\n",
- "**{\"Title\": \"Short-Sleeved Shirt\"...\n",
- "JSON data not found in caption: end_header_id|>\n",
- "\n",
- "**JSON Caption**\n",
- "\n",
- "{\n",
- " \"Title\": \"D...\n",
- "JSON data not found in caption: end_header_id|>\n",
- "\n",
- "**Product Description**\n",
- "\n",
- "* Title:...\n",
- "JSON data not found in caption: end_header_id|>\n",
- "\n",
- "I can't fulfill your request, but...\n",
- "JSON data not found in caption: end_header_id|>\n",
- "\n",
- "**Product Details**\n",
- "\n",
- "* **Title**:...\n",
- "JSON data not found in caption: end_header_id|>\n",
- "\n",
- "**Product Description**\n",
- "\n",
- "* **Titl...\n",
- "JSON data not found in caption: end_header_id|>\n",
- "\n",
- "I cannot create a caption that de...\n",
- "JSON data not found in caption: end_header_id|>\n",
- "\n",
- "**Product Description**\n",
- "\n",
- "{\n",
- " \"Tit...\n",
- "JSON decode error: Expecting ',' delimiter: line 1 column 216 (char 215)\n",
- "Problematic caption: end_header_id|>\n",
- "\n",
- "{\"Title\": \"NYC Frenzy Shorts\", \"S...\n",
- "JSON data not found in caption: end_header_id|>\n",
- "\n",
- "I can't provide a response to thi...\n",
- "JSON data not found in caption: end_header_id|>\n",
- "\n",
- "**Solution to the Problem**\n",
- "\n",
- "To s...\n",
- "JSON data not found in caption: end_header_id|>\n",
- "\n",
- "Here is a description of the imag...\n",
- "JSON data not found in caption: end_header_id|>\n",
- "\n",
- "**Product Details**\n",
- "\n",
- "* **Title**:...\n",
- "JSON decode error: Expecting ',' delimiter: line 1 column 266 (char 265)\n",
- "Problematic caption: end_header_id|>\n",
- "\n",
- "{\"Title\": \"Horror on the Bosphoru...\n",
- "JSON decode error: Expecting ',' delimiter: line 7 column 174 (char 297)\n",
- "Problematic caption: end_header_id|>\n",
- "\n",
- "{ \n",
- "\"Title\": \"Light Blue Baby Romp...\n",
- "JSON data not found in caption: end_header_id|>\n",
- "\n",
- "**Title:** Black and White Typogr...\n",
- "JSON data not found in caption: end_header_id|>\n",
- "\n",
- "**{**\n",
- "\"Title\": \"Blue Wrap Style S...\n",
- "JSON data not found in caption: end_header_id|>\n",
- "\n",
- "**JSON Caption**\n",
- "\n",
- "{\"Title\": \"Hawa...\n",
- "JSON data not found in caption: end_header_id|>\n",
- "\n",
- "I cannot assist you with that req...\n",
- "JSON data not found in caption: end_header_id|>\n",
- "\n",
- "I cannot help you with that reque...\n",
- "JSON data not found in caption: end_header_id|>\n",
- "\n",
- "I'm not able to provide a descrip...\n",
- "JSON data not found in caption: end_header_id|>\n",
- "\n",
- "**Image Description**\n",
- "\n",
- "{ \"Title\":...\n",
- "JSON data not found in caption: end_header_id|>\n",
- "\n",
- "I cannot fulfil your request, I'm...\n",
- "JSON decode error: Expecting ',' delimiter: line 1 column 203 (char 202)\n",
- "Problematic caption: end_header_id|>\n",
- "\n",
- "{\"Title\": \"Snot at All Board\", \"S...\n",
- "JSON data not found in caption: end_header_id|>\n",
- "\n",
- "**Product Description**\n",
- "\n",
- "**Title*...\n",
- "JSON data not found in caption: end_header_id|>\n",
- "\n",
- "I cannot provide a caption that d...\n",
- "JSON data not found in caption: end_header_id|>\n",
- "\n",
- "I cannot generate original conten...\n",
- "JSON data not found in caption: end_header_id|>\n",
- "\n",
- "I cannot identify the shoes' bran...\n",
- "JSON data not found in caption: end_header_id|>\n",
- "\n",
- "**Title:** \"Midnight Blue Jeans\"\n",
- "...\n",
- "JSON data not found in caption: end_header_id|>\n",
- "\n",
- "I can't provide a response using ...\n",
- "JSON data not found in caption: end_header_id|>\n",
- "\n",
- "I'm happy to help you with your r...\n",
- "JSON data not found in caption: end_header_id|>\n",
- "\n",
- "{ \n",
- " \"Title\": \"Pink Dress\", \n",
- " \"...\n",
- "JSON data not found in caption: end_header_id|>\n",
- "\n",
- "Here is the caption in the format...\n",
- "JSON data not found in caption: end_header_id|>\n",
- "\n",
- "**JSON Caption**\n",
- "\n",
- "{\"Title\": \"Blue...\n",
- "JSON data not found in caption: end_header_id|>\n",
- "\n",
- "Here is a rewritten caption in th...\n",
- "JSON data not found in caption: end_header_id|>\n",
- "\n",
- "**Product Description**\n",
- "\n",
- "* **Titl...\n",
- "JSON decode error: Extra data: line 6 column 282 (char 386)\n",
- "Problematic caption: end_header_id|>\n",
- "\n",
- "{\"Title\": \"Long Sleeve Grey Top\",...\n",
- "JSON data not found in caption: end_header_id|>\n",
- "\n",
- "**Product Details**\n",
- "\n",
- "* **Title**:...\n",
- "JSON data not found in caption: end_header_id|>\n",
- "\n",
- "**Product Details**\n",
- "\n",
- "* **Title**:...\n",
- "JSON data not found in caption: end_header_id|>\n",
- "\n",
- "Here is the response to the image...\n",
- "JSON data not found in caption: end_header_id|>\n",
- "\n",
- "I cannot confidently answer this ...\n",
- "JSON data not found in caption: end_header_id|>\n",
- "\n",
- "{\"Title\": \"Cute Long-Sleeved Shir...\n",
- "JSON decode error: Expecting value: line 2 column 13 (char 49)\n",
- "Problematic caption: end_header_id|>\n",
- "\n",
- "{ \"Title\": \"White V-Neck Tank Top...\n",
- "JSON data not found in caption: end_header_id|>\n",
- "\n",
- "{\"Title\": \"Hand-painted t-shirt\",...\n",
- "JSON data not found in caption: end_header_id|>\n",
- "\n",
- "**Product Description**\n",
- "\n",
- "* **Titl...\n",
- "JSON decode error: Expecting ',' delimiter: line 7 column 287 (char 393)\n",
- "Problematic caption: end_header_id|>\n",
- "\n",
- "{ \n",
- "\"Title\": \"Cute Owl T-Shirt\", \n",
- "...\n",
- "JSON data not found in caption: end_header_id|>\n",
- "\n",
- "I cannot provide a response as it...\n",
- "JSON data not found in caption: end_header_id|>\n",
- "\n",
- "**Item Description**\n",
- "\n",
- "* **Title...\n",
- "JSON data not found in caption: end_header_id|>\n",
- "\n",
- "I cannot help with that request.<...\n",
- "JSON data not found in caption: end_header_id|>\n",
- "\n",
- "I'm unable to assist with that re...\n",
- "JSON data not found in caption: end_header_id|>\n",
- "\n",
- "**Product Description**\n",
- "\n",
- "* **Titl...\n",
- "JSON data not found in caption: end_header_id|>\n",
- "\n",
- "**Product Description**\n",
- "\n",
- "* Title:...\n",
- "JSON data not found in caption: end_header_id|>\n",
- "\n",
- "{\"Title\": \"Ladies' Formal Jacket\"...\n",
- "JSON data not found in caption: end_header_id|>\n",
- "\n",
- "Here is a rephrased version of th...\n",
- "JSON data not found in caption: end_header_id|>\n",
- "\n",
- "Here is the caption in the format...\n",
- "JSON data not found in caption: end_header_id|>\n",
- "\n",
- "**Dictionary Format Caption**\n",
- "\n",
- "* ...\n",
- "JSON data not found in caption: end_header_id|>\n",
- "\n",
- "**Product Description**\n",
- "\n",
- "{\"Title\"...\n",
- "JSON data not found in caption: end_header_id|>\n",
- "\n",
- "I can't help but feel like I've g...\n",
- "JSON data not found in caption: end_header_id|>\n",
- "\n",
- "{\n",
- " \"Title\": \"Women's Grey Pants\"...\n",
- "JSON decode error: Expecting ',' delimiter: line 7 column 162 (char 272)\n",
- "Problematic caption: end_header_id|>\n",
- "\n",
- "{ \n",
- "\"Title\": \"Anna Montanara Slipp...\n",
- "JSON data not found in caption: end_header_id|>\n",
- "\n",
- "Here is the description of the cl...\n",
- "JSON data not found in caption: end_header_id|>\n",
- "\n",
- "{ \"Title\": \"Cycling Shorts\", \"Siz...\n",
- "JSON decode error: Expecting ',' delimiter: line 1 column 406 (char 405)\n",
- "Problematic caption: end_header_id|>\n",
- "\n",
- "{ \"Title\": \"Formal Pants with Zip...\n",
- "JSON data not found in caption: end_header_id|>\n",
- "\n",
- "I can't confidently answer this q...\n",
- "JSON data not found in caption: end_header_id|>\n",
- "\n",
- "**Description of a White T-Shirt ...\n",
- "JSON decode error: Expecting ',' delimiter: line 1 column 408 (char 407)\n",
- "Problematic caption: end_header_id|>\n",
- "\n",
- "{\"Title\": \"Grey Sequin Cat T-Shir...\n",
- "JSON data not found in caption: end_header_id|>\n",
- "\n",
- "Here is the caption for the image...\n",
- "JSON data not found in caption: end_header_id|>\n",
- "\n",
- "Here is the description of the cl...\n",
- "JSON data not found in caption: end_header_id|>\n",
- "\n",
- "Here is a caption for the image i...\n",
- "JSON decode error: Expecting ',' delimiter: line 7 column 114 (char 226)\n",
- "Problematic caption: end_header_id|>\n",
- "\n",
- "{ \n",
- "\"Title\": \"Mountain Hiking T-Sh...\n"
- ]
- },
- {
- "ename": "KeyError",
- "evalue": "'Filename'",
- "output_type": "error",
- "traceback": [
- "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
- "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)",
- "File \u001b[0;32m~/.conda/envs/final-checking-meta/lib/python3.12/site-packages/pandas/core/indexes/base.py:3805\u001b[0m, in \u001b[0;36mIndex.get_loc\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 3804\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m-> 3805\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_engine\u001b[38;5;241m.\u001b[39mget_loc(casted_key)\n\u001b[1;32m 3806\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m err:\n",
- "File \u001b[0;32mindex.pyx:167\u001b[0m, in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[0;34m()\u001b[0m\n",
- "File \u001b[0;32mindex.pyx:196\u001b[0m, in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[0;34m()\u001b[0m\n",
- "File \u001b[0;32mpandas/_libs/hashtable_class_helper.pxi:7081\u001b[0m, in \u001b[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001b[0;34m()\u001b[0m\n",
- "File \u001b[0;32mpandas/_libs/hashtable_class_helper.pxi:7089\u001b[0m, in \u001b[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001b[0;34m()\u001b[0m\n",
- "\u001b[0;31mKeyError\u001b[0m: 'Filename'",
- "\nThe above exception was the direct cause of the following exception:\n",
- "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)",
- "Cell \u001b[0;32mIn[33], line 27\u001b[0m\n\u001b[1;32m 25\u001b[0m \u001b[38;5;66;03m# Fill NaN values with empty strings\u001b[39;00m\n\u001b[1;32m 26\u001b[0m metadata \u001b[38;5;241m=\u001b[39m metadata\u001b[38;5;241m.\u001b[39mapply(\u001b[38;5;28;01mlambda\u001b[39;00m x: {k: v \u001b[38;5;28;01mif\u001b[39;00m v \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m'\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m k, v \u001b[38;5;129;01min\u001b[39;00m x\u001b[38;5;241m.\u001b[39mitems()})\n\u001b[0;32m---> 27\u001b[0m df \u001b[38;5;241m=\u001b[39m pd\u001b[38;5;241m.\u001b[39mconcat([df[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mFilename\u001b[39m\u001b[38;5;124m'\u001b[39m], pd\u001b[38;5;241m.\u001b[39mDataFrame(metadata\u001b[38;5;241m.\u001b[39mtolist())], axis\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m1\u001b[39m)\n\u001b[1;32m 28\u001b[0m dataframes\u001b[38;5;241m.\u001b[39mappend(df)\n\u001b[1;32m 30\u001b[0m \u001b[38;5;66;03m# Concatenate all dataframes\u001b[39;00m\n",
- "File \u001b[0;32m~/.conda/envs/final-checking-meta/lib/python3.12/site-packages/pandas/core/frame.py:4102\u001b[0m, in \u001b[0;36mDataFrame.__getitem__\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 4100\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcolumns\u001b[38;5;241m.\u001b[39mnlevels \u001b[38;5;241m>\u001b[39m \u001b[38;5;241m1\u001b[39m:\n\u001b[1;32m 4101\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_getitem_multilevel(key)\n\u001b[0;32m-> 4102\u001b[0m indexer \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcolumns\u001b[38;5;241m.\u001b[39mget_loc(key)\n\u001b[1;32m 4103\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m is_integer(indexer):\n\u001b[1;32m 4104\u001b[0m indexer \u001b[38;5;241m=\u001b[39m [indexer]\n",
- "File \u001b[0;32m~/.conda/envs/final-checking-meta/lib/python3.12/site-packages/pandas/core/indexes/base.py:3812\u001b[0m, in \u001b[0;36mIndex.get_loc\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 3807\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(casted_key, \u001b[38;5;28mslice\u001b[39m) \u001b[38;5;129;01mor\u001b[39;00m (\n\u001b[1;32m 3808\u001b[0m \u001b[38;5;28misinstance\u001b[39m(casted_key, abc\u001b[38;5;241m.\u001b[39mIterable)\n\u001b[1;32m 3809\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28many\u001b[39m(\u001b[38;5;28misinstance\u001b[39m(x, \u001b[38;5;28mslice\u001b[39m) \u001b[38;5;28;01mfor\u001b[39;00m x \u001b[38;5;129;01min\u001b[39;00m casted_key)\n\u001b[1;32m 3810\u001b[0m ):\n\u001b[1;32m 3811\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m InvalidIndexError(key)\n\u001b[0;32m-> 3812\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m(key) \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01merr\u001b[39;00m\n\u001b[1;32m 3813\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mTypeError\u001b[39;00m:\n\u001b[1;32m 3814\u001b[0m \u001b[38;5;66;03m# If we have a listlike key, _check_indexing_error will raise\u001b[39;00m\n\u001b[1;32m 3815\u001b[0m \u001b[38;5;66;03m# InvalidIndexError. Otherwise we fall through and re-raise\u001b[39;00m\n\u001b[1;32m 3816\u001b[0m \u001b[38;5;66;03m# the TypeError.\u001b[39;00m\n\u001b[1;32m 3817\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_check_indexing_error(key)\n",
- "\u001b[0;31mKeyError\u001b[0m: 'Filename'"
- ]
- }
- ],
- "source": [
- "def parse_caption(caption):\n",
- " try:\n",
- " # Extract JSON string from caption\n",
- " json_str = re.search(r'end_header_id\\|>\\s*(\\{.*?\\})\\s*<\\|eot_id\\|>', caption, re.DOTALL)\n",
- " if json_str:\n",
- " json_data = json.loads(json_str.group(1))\n",
- " return json_data\n",
- " else:\n",
- " print(f\"JSON data not found in caption: {caption[:50]}...\")\n",
- " return {}\n",
- " except json.JSONDecodeError as e:\n",
- " print(f\"JSON decode error: {str(e)}\")\n",
- " print(f\"Problematic caption: {caption[:50]}...\")\n",
- " return {}\n",
- "\n",
- "# The list of caption CSV files (csv_files) is defined in an earlier cell, e.g.\n",
- "# csv_files = ['file1.csv', 'file2.csv', ..., 'file8.csv']\n",
- "\n",
- "# Read and process each CSV\n",
- "dataframes = []\n",
- "for file in csv_files:\n",
- " df = pd.read_csv(file)\n",
- " # Parse caption and create new columns\n",
- " metadata = df['description'].apply(parse_caption)\n",
- " # Replace None values in the parsed metadata with empty strings\n",
- " metadata = metadata.apply(lambda x: {k: v if v is not None else '' for k, v in x.items()})\n",
- " df = pd.concat([df['Filename'], pd.DataFrame(metadata.tolist())], axis=1)\n",
- " dataframes.append(df)\n",
- "\n",
- "# Concatenate all dataframes\n",
- "result = pd.concat(dataframes, ignore_index=True)\n",
- "\n",
- "# Save the result\n",
- "result.to_csv('joined_data.csv', index=False)\n",
- "\n",
- "# Read and process each CSV again, this time without replacing None values\n",
- "dataframes = []\n",
- "for file in csv_files:\n",
- " df = pd.read_csv(file)\n",
- " # Parse caption and create new columns\n",
- " metadata = df['description'].apply(parse_caption)\n",
- " df = pd.concat([df['Filename'], pd.DataFrame(metadata.tolist())], axis=1)\n",
- " dataframes.append(df)\n",
- "\n",
- "# Concatenate all dataframes\n",
- "result = pd.concat(dataframes, ignore_index=True)\n",
- "\n",
- "# Save the result\n",
- "result.to_csv('joined_data.csv', index=False)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 40,
- "id": "fd13a94a-ed78-4bf1-b264-538610fbb302",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "np.int64(3117)"
- ]
- },
- "execution_count": 40,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "len(result) - result['Title'].isna().sum()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 35,
- "id": "51e062a4-670c-49b7-912f-6649556a36f6",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "count 3117\n",
- "unique 2757\n",
- "top Blue Denim Jeans\n",
- "freq 16\n",
- "Name: Title, dtype: object"
- ]
- },
- "execution_count": 35,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "result['Title'].describe()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 41,
- "id": "d49e49c6-7e44-4bf2-bd53-d6eeaf4a824a",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "<div>\n",
- "<style scoped>\n",
- " .dataframe tbody tr th:only-of-type {\n",
- " vertical-align: middle;\n",
- " }\n",
- "\n",
- " .dataframe tbody tr th {\n",
- " vertical-align: top;\n",
- " }\n",
- "\n",
- " .dataframe thead th {\n",
- " text-align: right;\n",
- " }\n",
- "</style>\n",
- "<table border=\"1\" class=\"dataframe\">\n",
- " <thead>\n",
- " <tr style=\"text-align: right;\">\n",
- " <th></th>\n",
- " <th>Filename</th>\n",
- " <th>Title</th>\n",
- " <th>Size</th>\n",
- " <th>Category</th>\n",
- " <th>Gender</th>\n",
- " <th>Type</th>\n",
- " <th>Description</th>\n",
- " <th>size</th>\n",
- " </tr>\n",
- " </thead>\n",
- " <tbody>\n",
- " <tr>\n",
- " <th>0</th>\n",
- " <td>d7ed1d64-2c65-427f-9ae4-eb4aaa3e2389.jpg</td>\n",
- " <td>Stylish and Trendy Tank Top with Celestial Design</td>\n",
- " <td>M</td>\n",
- " <td>Tops</td>\n",
- " <td>F</td>\n",
- " <td>Casual</td>\n",
- " <td>This white tank top is a stylish and trendy pi...</td>\n",
- " <td>NaN</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>1</th>\n",
- " <td>5c1b7a77-1fa3-4af8-9722-cd38e45d89da.jpg</td>\n",
- " <td>Classic White Sweatshirt</td>\n",
- " <td>M</td>\n",
- " <td>Tops</td>\n",
- " <td>F</td>\n",
- " <td>Casual</td>\n",
- " <td>This classic white sweatshirt is a timeless pi...</td>\n",
- " <td>NaN</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>2</th>\n",
- " <td>b2e084c7-e3a0-4182-8671-b908544a7cf2.jpg</td>\n",
- " <td>Grey T-shirt</td>\n",
- " <td>M</td>\n",
- " <td>T-Shirt</td>\n",
- " <td>Unisex</td>\n",
- " <td>Casual</td>\n",
- " <td>This is a short-sleeved, crew neck t-shirt tha...</td>\n",
- " <td>NaN</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>3</th>\n",
- " <td>9d053b67-64e1-4050-a509-27332b9eca54.jpg</td>\n",
- " <td>NaN</td>\n",
- " <td>NaN</td>\n",
- " <td>NaN</td>\n",
- " <td>NaN</td>\n",
- " <td>NaN</td>\n",
- " <td>NaN</td>\n",
- " <td>NaN</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>4</th>\n",
- " <td>d885f493-1070-4d51-bd11-f1ec156a2aa7.jpg</td>\n",
- " <td>NaN</td>\n",
- " <td>NaN</td>\n",
- " <td>NaN</td>\n",
- " <td>NaN</td>\n",
- " <td>NaN</td>\n",
- " <td>NaN</td>\n",
- " <td>NaN</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>...</th>\n",
- " <td>...</td>\n",
- " <td>...</td>\n",
- " <td>...</td>\n",
- " <td>...</td>\n",
- " <td>...</td>\n",
- " <td>...</td>\n",
- " <td>...</td>\n",
- " <td>...</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>5751</th>\n",
- " <td>ae9cec7a-dd1d-49bc-adae-6446429c03d8.jpg</td>\n",
- " <td>Men's Light Blue and White Striped Long-Sleeve...</td>\n",
- " <td>M</td>\n",
- " <td>Tops</td>\n",
- " <td>M</td>\n",
- " <td>Casual</td>\n",
- " <td>This men's light blue and white striped long-s...</td>\n",
- " <td>NaN</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>5752</th>\n",
- " <td>de853711-0b97-45a6-a794-3c424246db03.jpg</td>\n",
- " <td>Black Sneakers</td>\n",
- " <td>S</td>\n",
- " <td>Shoes</td>\n",
- " <td>U</td>\n",
- " <td>Casual</td>\n",
- " <td>These sleek and versatile black sneakers are a...</td>\n",
- " <td>NaN</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>5753</th>\n",
- " <td>d4b0b957-5632-4df1-aba6-e562e2a84687.jpg</td>\n",
- " <td>Gray T-Shirt with Hood and Graphic</td>\n",
- " <td>M</td>\n",
- " <td>T-Shirt</td>\n",
- " <td>M</td>\n",
- " <td>Casual</td>\n",
- " <td>The gray t-shirt with a hood and graphic is a ...</td>\n",
- " <td>NaN</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>5754</th>\n",
- " <td>89074ff2-ebfe-4790-892e-8513625a05b0.jpg</td>\n",
- " <td>NaN</td>\n",
- " <td>NaN</td>\n",
- " <td>NaN</td>\n",
- " <td>NaN</td>\n",
- " <td>NaN</td>\n",
- " <td>NaN</td>\n",
- " <td>NaN</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>5755</th>\n",
- " <td>0949e8e0-c807-4b6d-8453-80a05f1b733e.jpg</td>\n",
- " <td>NaN</td>\n",
- " <td>NaN</td>\n",
- " <td>NaN</td>\n",
- " <td>NaN</td>\n",
- " <td>NaN</td>\n",
- " <td>NaN</td>\n",
- " <td>NaN</td>\n",
- " </tr>\n",
- " </tbody>\n",
- "</table>\n",
- "<p>5756 rows × 8 columns</p>\n",
- "</div>"
- ],
- "text/plain": [
- " Filename \\\n",
- "0 d7ed1d64-2c65-427f-9ae4-eb4aaa3e2389.jpg \n",
- "1 5c1b7a77-1fa3-4af8-9722-cd38e45d89da.jpg \n",
- "2 b2e084c7-e3a0-4182-8671-b908544a7cf2.jpg \n",
- "3 9d053b67-64e1-4050-a509-27332b9eca54.jpg \n",
- "4 d885f493-1070-4d51-bd11-f1ec156a2aa7.jpg \n",
- "... ... \n",
- "5751 ae9cec7a-dd1d-49bc-adae-6446429c03d8.jpg \n",
- "5752 de853711-0b97-45a6-a794-3c424246db03.jpg \n",
- "5753 d4b0b957-5632-4df1-aba6-e562e2a84687.jpg \n",
- "5754 89074ff2-ebfe-4790-892e-8513625a05b0.jpg \n",
- "5755 0949e8e0-c807-4b6d-8453-80a05f1b733e.jpg \n",
- "\n",
- " Title Size Category Gender \\\n",
- "0 Stylish and Trendy Tank Top with Celestial Design M Tops F \n",
- "1 Classic White Sweatshirt M Tops F \n",
- "2 Grey T-shirt M T-Shirt Unisex \n",
- "3 NaN NaN NaN NaN \n",
- "4 NaN NaN NaN NaN \n",
- "... ... ... ... ... \n",
- "5751 Men's Light Blue and White Striped Long-Sleeve... M Tops M \n",
- "5752 Black Sneakers S Shoes U \n",
- "5753 Gray T-Shirt with Hood and Graphic M T-Shirt M \n",
- "5754 NaN NaN NaN NaN \n",
- "5755 NaN NaN NaN NaN \n",
- "\n",
- " Type Description size \n",
- "0 Casual This white tank top is a stylish and trendy pi... NaN \n",
- "1 Casual This classic white sweatshirt is a timeless pi... NaN \n",
- "2 Casual This is a short-sleeved, crew neck t-shirt tha... NaN \n",
- "3 NaN NaN NaN \n",
- "4 NaN NaN NaN \n",
- "... ... ... ... \n",
- "5751 Casual This men's light blue and white striped long-s... NaN \n",
- "5752 Casual These sleek and versatile black sneakers are a... NaN \n",
- "5753 Casual The gray t-shirt with a hood and graphic is a ... NaN \n",
- "5754 NaN NaN NaN \n",
- "5755 NaN NaN NaN \n",
- "\n",
- "[5756 rows x 8 columns]"
- ]
- },
- "execution_count": 41,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "result"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "9577f9f6-23e7-4fde-a162-2fa633265399",
- "metadata": {},
- "source": [
- "### Creating a Vector DB"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "5e7d968d-bf1b-4a43-ad9f-7f2ca6736c1d",
- "metadata": {},
- "outputs": [],
- "source": []
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3 (ipykernel)",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.12.5"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 5
- }
|