Part_2_Cleaning_Data_and_DB 34 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862
  1. {
  2. "cells": [
  3. {
  4. "cell_type": "markdown",
  5. "id": "98cc49e3-6669-4a7a-be02-a2025d397a4c",
  6. "metadata": {},
  7. "source": [
  8. "## Cleaning up the Annotations and Creating Vector DB"
  9. ]
  10. },
  11. {
  12. "cell_type": "markdown",
  13. "id": "6c6b84dd-ac69-49b5-9f4b-3c22d60c585c",
  14. "metadata": {},
  15. "source": [
  16. "### Cleaning up Annotations"
  17. ]
  18. },
  19. {
  20. "cell_type": "code",
  21. "execution_count": 3,
  22. "id": "8ddba296-47b5-4e10-85c1-7ebd51aa215c",
  23. "metadata": {},
  24. "outputs": [],
  25. "source": [
  26. "DATA = \"./DATA/\"\n",
  27. "META_DATA = f\"{DATA}images.csv/\"\n",
  28. "IMAGES = f\"{DATA}images_compressed/\"\n",
  29. "\n",
  30. "hf_token = \"\"\n",
  31. "model_name = \"meta-llama/Llama-3.2-11b-Vision-Instruct\""
  32. ]
  33. },
  34. {
  35. "cell_type": "code",
  36. "execution_count": 18,
  37. "id": "7aa81c66-def6-4d51-aa64-c97283c84686",
  38. "metadata": {},
  39. "outputs": [],
  40. "source": [
  41. "import pandas as pd\n",
  42. "import numpy as np\n",
  43. "import json\n",
  44. "import re"
  45. ]
  46. },
  47. {
  48. "cell_type": "code",
  49. "execution_count": 30,
  50. "id": "26be4145-dff1-4ece-8909-4346b253a799",
  51. "metadata": {},
  52. "outputs": [],
  53. "source": [
  54. "# List of your CSV files\n",
  55. "csv_files = [\n",
  56. " \"./first_250_proper_captions.csv\",\n",
  57. " \"./second_250_to_359.csv\",\n",
  58. " \"./second_p2_360_to_500.csv\",\n",
  59. " \"./second_p3_500_to_750.csv\",\n",
  60. " \"./third_750_to_1250.csv\",\n",
  61. " \"./fourth_1250_to_2000.csv\",\n",
  62. " \"./fifth_2000_to_3000.csv\",\n",
  63. " \"./sixth_3000_to_4000.csv\",\n",
  64. " \"./seventh_4000_to_4500.csv\",\n",
  65. " \"./eight_4500_to_5000.csv\",\n",
  66. " \"./ninth.csv\",\n",
  67. " \"./tenth.csv\",\n",
  68. " \"./eleven.csv\"\n",
  69. "]"
  70. ]
  71. },
  72. {
  73. "cell_type": "code",
  74. "execution_count": 33,
  75. "id": "b93654ab-d6be-4737-af46-9073889ead45",
  76. "metadata": {},
  77. "outputs": [
  78. {
  79. "name": "stdout",
  80. "output_type": "stream",
  81. "text": [
  82. "JSON data not found in caption: end_header_id|>\n",
  83. "\n",
  84. "I cannot help you with that reque...\n",
  85. "JSON data not found in caption: end_header_id|>\n",
  86. "\n",
  87. "I cannot help with this request.<...\n",
  88. "JSON data not found in caption: end_header_id|>\n",
  89. "\n",
  90. "**I'm happy to help you with your...\n",
  91. "JSON data not found in caption: end_header_id|>\n",
  92. "\n",
  93. "**Product Description**\n",
  94. "\n",
  95. "**Title*...\n",
  96. "JSON data not found in caption: end_header_id|>\n",
  97. "\n",
  98. "I cannot provide a response to th...\n",
  99. "JSON data not found in caption: end_header_id|>\n",
  100. "\n",
  101. "**{\"Title\": \"Hand-Drawn Patterned...\n",
  102. "JSON data not found in caption: end_header_id|>\n",
  103. "\n",
  104. "I cannot provide a step-by-step r...\n",
  105. "JSON data not found in caption: end_header_id|>\n",
  106. "\n",
  107. "I cannot provide a response, as i...\n",
  108. "JSON data not found in caption: end_header_id|>\n",
  109. "\n",
  110. "{\"Title\": \"White Blouse\", \"Size\":...\n",
  111. "JSON data not found in caption: end_header_id|>\n",
  112. "\n",
  113. "{\"Title\": \"Unicorn Skirt and T-sh...\n",
  114. "JSON decode error: Expecting ',' delimiter: line 7 column 237 (char 338)\n",
  115. "Problematic caption: end_header_id|>\n",
  116. "\n",
  117. "{ \n",
  118. "\"Title\": \"Red Rugby Shirt\", \n",
  119. "\"...\n",
  120. "JSON data not found in caption: end_header_id|>\n",
  121. "\n",
  122. "I'm happy to help you with your r...\n",
  123. "JSON data not found in caption: end_header_id|>\n",
  124. "\n",
  125. "I can't help you with that.<|eot_...\n",
  126. "JSON data not found in caption: end_header_id|>\n",
  127. "\n",
  128. "**Title:** Elegant Long-Sleeved S...\n",
  129. "JSON data not found in caption: end_header_id|>\n",
  130. "\n",
  131. "**Product Description**\n",
  132. "\n",
  133. "**Title*...\n",
  134. "JSON data not found in caption: end_header_id|>\n",
  135. "\n",
  136. "**Item Description**\n",
  137. "\n",
  138. "**Title**: ...\n",
  139. "JSON decode error: Expecting property name enclosed in double quotes: line 1 column 2 (char 1)\n",
  140. "Problematic caption: end_header_id|>\n",
  141. "\n",
  142. "{\\\n",
  143. "\"Title\": \"Black Jacket with Zi...\n",
  144. "JSON data not found in caption: end_header_id|>\n",
  145. "\n",
  146. "**JSON Caption**\n",
  147. "\n",
  148. "{ \"Title\": \"Tea...\n",
  149. "JSON data not found in caption: end_header_id|>\n",
  150. "\n",
  151. "{ \"Title\": \"Purple Snowsuit with ...\n",
  152. "JSON data not found in caption: end_header_id|>\n",
  153. "\n",
  154. "I cannot provide a response using...\n",
  155. "JSON data not found in caption: end_header_id|>\n",
  156. "\n",
  157. "**\"Black Leather Jacket\"**\n",
  158. "\n",
  159. "* {\"T...\n",
  160. "JSON data not found in caption: end_header_id|>\n",
  161. "\n",
  162. "Here is a dictionary containing a...\n",
  163. "JSON data not found in caption: end_header_id|>\n",
  164. "\n",
  165. "{ \"Title\": \"Leather shoes\", \"Size...\n",
  166. "JSON decode error: Expecting ',' delimiter: line 7 column 351 (char 480)\n",
  167. "Problematic caption: end_header_id|>\n",
  168. "\n",
  169. "{ \n",
  170. "\"Title\": \"Baby Snow Suit with ...\n",
  171. "JSON data not found in caption: end_header_id|>\n",
  172. "\n",
  173. "{\"Title\": \"Grey Hooded Fleece Pul...\n",
  174. "JSON data not found in caption: end_header_id|>\n",
  175. "\n",
  176. "**JSON Caption for the Image**\n",
  177. "\n",
  178. "{...\n",
  179. "JSON data not found in caption: end_header_id|>\n",
  180. "\n",
  181. "I'm not capable of generating cap...\n",
  182. "JSON data not found in caption: end_header_id|>\n",
  183. "\n",
  184. "I cannot provide a response to th...\n",
  185. "JSON decode error: Extra data: line 3 column 1 (char 298)\n",
  186. "Problematic caption: end_header_id|>\n",
  187. "\n",
  188. "{ \"Title\": \"Grey Jacket\", \"Size\":...\n",
  189. "JSON data not found in caption: end_header_id|>\n",
  190. "\n",
  191. "I cannot provide a response to th...\n",
  192. "JSON data not found in caption: end_header_id|>\n",
  193. "\n",
  194. "**Product Description**\n",
  195. "\n",
  196. "{ \n",
  197. " \"Ti...\n",
  198. "JSON data not found in caption: end_header_id|>\n",
  199. "\n",
  200. "{\"Title\": \"Cable Knit Sweater\", \"...\n",
  201. "JSON data not found in caption: end_header_id|>\n",
  202. "\n",
  203. "**Product Description**\n",
  204. "\n",
  205. "* Title:...\n",
  206. "JSON data not found in caption: end_header_id|>\n",
  207. "\n",
  208. "I'm not able to identify the styl...\n",
  209. "JSON data not found in caption: end_header_id|>\n",
  210. "\n",
  211. "I'm unable to provide a caption f...\n",
  212. "JSON data not found in caption: end_header_id|>\n",
  213. "\n",
  214. "**{\"Title\": \"Short-Sleeved Shirt\"...\n",
  215. "JSON data not found in caption: end_header_id|>\n",
  216. "\n",
  217. "**JSON Caption**\n",
  218. "\n",
  219. "{\n",
  220. " \"Title\": \"D...\n",
  221. "JSON data not found in caption: end_header_id|>\n",
  222. "\n",
  223. "**Product Description**\n",
  224. "\n",
  225. "* Title:...\n",
  226. "JSON data not found in caption: end_header_id|>\n",
  227. "\n",
  228. "I can't fulfill your request, but...\n",
  229. "JSON data not found in caption: end_header_id|>\n",
  230. "\n",
  231. "**Product Details**\n",
  232. "\n",
  233. "* **Title**:...\n",
  234. "JSON data not found in caption: end_header_id|>\n",
  235. "\n",
  236. "**Product Description**\n",
  237. "\n",
  238. "* **Titl...\n",
  239. "JSON data not found in caption: end_header_id|>\n",
  240. "\n",
  241. "I cannot create a caption that de...\n",
  242. "JSON data not found in caption: end_header_id|>\n",
  243. "\n",
  244. "**Product Description**\n",
  245. "\n",
  246. "{\n",
  247. " \"Tit...\n",
  248. "JSON decode error: Expecting ',' delimiter: line 1 column 216 (char 215)\n",
  249. "Problematic caption: end_header_id|>\n",
  250. "\n",
  251. "{\"Title\": \"NYC Frenzy Shorts\", \"S...\n",
  252. "JSON data not found in caption: end_header_id|>\n",
  253. "\n",
  254. "I can't provide a response to thi...\n",
  255. "JSON data not found in caption: end_header_id|>\n",
  256. "\n",
  257. "**Solution to the Problem**\n",
  258. "\n",
  259. "To s...\n",
  260. "JSON data not found in caption: end_header_id|>\n",
  261. "\n",
  262. "Here is a description of the imag...\n",
  263. "JSON data not found in caption: end_header_id|>\n",
  264. "\n",
  265. "**Product Details**\n",
  266. "\n",
  267. "* **Title**:...\n",
  268. "JSON decode error: Expecting ',' delimiter: line 1 column 266 (char 265)\n",
  269. "Problematic caption: end_header_id|>\n",
  270. "\n",
  271. "{\"Title\": \"Horror on the Bosphoru...\n",
  272. "JSON decode error: Expecting ',' delimiter: line 7 column 174 (char 297)\n",
  273. "Problematic caption: end_header_id|>\n",
  274. "\n",
  275. "{ \n",
  276. "\"Title\": \"Light Blue Baby Romp...\n",
  277. "JSON data not found in caption: end_header_id|>\n",
  278. "\n",
  279. "**Title:** Black and White Typogr...\n",
  280. "JSON data not found in caption: end_header_id|>\n",
  281. "\n",
  282. "**{**\n",
  283. "\"Title\": \"Blue Wrap Style S...\n",
  284. "JSON data not found in caption: end_header_id|>\n",
  285. "\n",
  286. "**JSON Caption**\n",
  287. "\n",
  288. "{\"Title\": \"Hawa...\n",
  289. "JSON data not found in caption: end_header_id|>\n",
  290. "\n",
  291. "I cannot assist you with that req...\n",
  292. "JSON data not found in caption: end_header_id|>\n",
  293. "\n",
  294. "I cannot help you with that reque...\n",
  295. "JSON data not found in caption: end_header_id|>\n",
  296. "\n",
  297. "I'm not able to provide a descrip...\n",
  298. "JSON data not found in caption: end_header_id|>\n",
  299. "\n",
  300. "**Image Description**\n",
  301. "\n",
  302. "{ \"Title\":...\n",
  303. "JSON data not found in caption: end_header_id|>\n",
  304. "\n",
  305. "I cannot fulfil your request, I'm...\n",
  306. "JSON decode error: Expecting ',' delimiter: line 1 column 203 (char 202)\n",
  307. "Problematic caption: end_header_id|>\n",
  308. "\n",
  309. "{\"Title\": \"Snot at All Board\", \"S...\n",
  310. "JSON data not found in caption: end_header_id|>\n",
  311. "\n",
  312. "**Product Description**\n",
  313. "\n",
  314. "**Title*...\n",
  315. "JSON data not found in caption: end_header_id|>\n",
  316. "\n",
  317. "I cannot provide a caption that d...\n",
  318. "JSON data not found in caption: end_header_id|>\n",
  319. "\n",
  320. "I cannot generate original conten...\n",
  321. "JSON data not found in caption: end_header_id|>\n",
  322. "\n",
  323. "I cannot identify the shoes' bran...\n",
  324. "JSON data not found in caption: end_header_id|>\n",
  325. "\n",
  326. "**Title:** \"Midnight Blue Jeans\"\n",
  327. "...\n",
  328. "JSON data not found in caption: end_header_id|>\n",
  329. "\n",
  330. "I can't provide a response using ...\n",
  331. "JSON data not found in caption: end_header_id|>\n",
  332. "\n",
  333. "I'm happy to help you with your r...\n",
  334. "JSON data not found in caption: end_header_id|>\n",
  335. "\n",
  336. "{ \n",
  337. " \"Title\": \"Pink Dress\", \n",
  338. " \"...\n",
  339. "JSON data not found in caption: end_header_id|>\n",
  340. "\n",
  341. "Here is the caption in the format...\n",
  342. "JSON data not found in caption: end_header_id|>\n",
  343. "\n",
  344. "**JSON Caption**\n",
  345. "\n",
  346. "{\"Title\": \"Blue...\n",
  347. "JSON data not found in caption: end_header_id|>\n",
  348. "\n",
  349. "Here is a rewritten caption in th...\n",
  350. "JSON data not found in caption: end_header_id|>\n",
  351. "\n",
  352. "**Product Description**\n",
  353. "\n",
  354. "* **Titl...\n",
  355. "JSON decode error: Extra data: line 6 column 282 (char 386)\n",
  356. "Problematic caption: end_header_id|>\n",
  357. "\n",
  358. "{\"Title\": \"Long Sleeve Grey Top\",...\n",
  359. "JSON data not found in caption: end_header_id|>\n",
  360. "\n",
  361. "**Product Details**\n",
  362. "\n",
  363. "* **Title**:...\n",
  364. "JSON data not found in caption: end_header_id|>\n",
  365. "\n",
  366. "**Product Details**\n",
  367. "\n",
  368. "* **Title**:...\n",
  369. "JSON data not found in caption: end_header_id|>\n",
  370. "\n",
  371. "Here is the response to the image...\n",
  372. "JSON data not found in caption: end_header_id|>\n",
  373. "\n",
  374. "I cannot confidently answer this ...\n",
  375. "JSON data not found in caption: end_header_id|>\n",
  376. "\n",
  377. "{\"Title\": \"Cute Long-Sleeved Shir...\n",
  378. "JSON decode error: Expecting value: line 2 column 13 (char 49)\n",
  379. "Problematic caption: end_header_id|>\n",
  380. "\n",
  381. "{ \"Title\": \"White V-Neck Tank Top...\n",
  382. "JSON data not found in caption: end_header_id|>\n",
  383. "\n",
  384. "{\"Title\": \"Hand-painted t-shirt\",...\n",
  385. "JSON data not found in caption: end_header_id|>\n",
  386. "\n",
  387. "**Product Description**\n",
  388. "\n",
  389. "* **Titl...\n",
  390. "JSON decode error: Expecting ',' delimiter: line 7 column 287 (char 393)\n",
  391. "Problematic caption: end_header_id|>\n",
  392. "\n",
  393. "{ \n",
  394. "\"Title\": \"Cute Owl T-Shirt\", \n",
  395. "...\n",
  396. "JSON data not found in caption: end_header_id|>\n",
  397. "\n",
  398. "I cannot provide a response as it...\n",
  399. "JSON data not found in caption: end_header_id|>\n",
  400. "\n",
  401. "**Item Description**\n",
  402. "\n",
  403. "* **Title...\n",
  404. "JSON data not found in caption: end_header_id|>\n",
  405. "\n",
  406. "I cannot help with that request.<...\n",
  407. "JSON data not found in caption: end_header_id|>\n",
  408. "\n",
  409. "I'm unable to assist with that re...\n",
  410. "JSON data not found in caption: end_header_id|>\n",
  411. "\n",
  412. "**Product Description**\n",
  413. "\n",
  414. "* **Titl...\n",
  415. "JSON data not found in caption: end_header_id|>\n",
  416. "\n",
  417. "**Product Description**\n",
  418. "\n",
  419. "* Title:...\n",
  420. "JSON data not found in caption: end_header_id|>\n",
  421. "\n",
  422. "{\"Title\": \"Ladies' Formal Jacket\"...\n",
  423. "JSON data not found in caption: end_header_id|>\n",
  424. "\n",
  425. "Here is a rephrased version of th...\n",
  426. "JSON data not found in caption: end_header_id|>\n",
  427. "\n",
  428. "Here is the caption in the format...\n",
  429. "JSON data not found in caption: end_header_id|>\n",
  430. "\n",
  431. "**Dictionary Format Caption**\n",
  432. "\n",
  433. "* ...\n",
  434. "JSON data not found in caption: end_header_id|>\n",
  435. "\n",
  436. "**Product Description**\n",
  437. "\n",
  438. "{\"Title\"...\n",
  439. "JSON data not found in caption: end_header_id|>\n",
  440. "\n",
  441. "I can't help but feel like I've g...\n",
  442. "JSON data not found in caption: end_header_id|>\n",
  443. "\n",
  444. "{\n",
  445. " \"Title\": \"Women's Grey Pants\"...\n",
  446. "JSON decode error: Expecting ',' delimiter: line 7 column 162 (char 272)\n",
  447. "Problematic caption: end_header_id|>\n",
  448. "\n",
  449. "{ \n",
  450. "\"Title\": \"Anna Montanara Slipp...\n",
  451. "JSON data not found in caption: end_header_id|>\n",
  452. "\n",
  453. "Here is the description of the cl...\n",
  454. "JSON data not found in caption: end_header_id|>\n",
  455. "\n",
  456. "{ \"Title\": \"Cycling Shorts\", \"Siz...\n",
  457. "JSON decode error: Expecting ',' delimiter: line 1 column 406 (char 405)\n",
  458. "Problematic caption: end_header_id|>\n",
  459. "\n",
  460. "{ \"Title\": \"Formal Pants with Zip...\n",
  461. "JSON data not found in caption: end_header_id|>\n",
  462. "\n",
  463. "I can't confidently answer this q...\n",
  464. "JSON data not found in caption: end_header_id|>\n",
  465. "\n",
  466. "**Description of a White T-Shirt ...\n",
  467. "JSON decode error: Expecting ',' delimiter: line 1 column 408 (char 407)\n",
  468. "Problematic caption: end_header_id|>\n",
  469. "\n",
  470. "{\"Title\": \"Grey Sequin Cat T-Shir...\n",
  471. "JSON data not found in caption: end_header_id|>\n",
  472. "\n",
  473. "Here is the caption for the image...\n",
  474. "JSON data not found in caption: end_header_id|>\n",
  475. "\n",
  476. "Here is the description of the cl...\n",
  477. "JSON data not found in caption: end_header_id|>\n",
  478. "\n",
  479. "Here is a caption for the image i...\n",
  480. "JSON decode error: Expecting ',' delimiter: line 7 column 114 (char 226)\n",
  481. "Problematic caption: end_header_id|>\n",
  482. "\n",
  483. "{ \n",
  484. "\"Title\": \"Mountain Hiking T-Sh...\n"
  485. ]
  486. },
  487. {
  488. "ename": "KeyError",
  489. "evalue": "'Filename'",
  490. "output_type": "error",
  491. "traceback": [
  492. "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
  493. "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)",
  494. "File \u001b[0;32m~/.conda/envs/final-checking-meta/lib/python3.12/site-packages/pandas/core/indexes/base.py:3805\u001b[0m, in \u001b[0;36mIndex.get_loc\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 3804\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m-> 3805\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_engine\u001b[38;5;241m.\u001b[39mget_loc(casted_key)\n\u001b[1;32m 3806\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m err:\n",
  495. "File \u001b[0;32mindex.pyx:167\u001b[0m, in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[0;34m()\u001b[0m\n",
  496. "File \u001b[0;32mindex.pyx:196\u001b[0m, in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[0;34m()\u001b[0m\n",
  497. "File \u001b[0;32mpandas/_libs/hashtable_class_helper.pxi:7081\u001b[0m, in \u001b[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001b[0;34m()\u001b[0m\n",
  498. "File \u001b[0;32mpandas/_libs/hashtable_class_helper.pxi:7089\u001b[0m, in \u001b[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001b[0;34m()\u001b[0m\n",
  499. "\u001b[0;31mKeyError\u001b[0m: 'Filename'",
  500. "\nThe above exception was the direct cause of the following exception:\n",
  501. "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)",
  502. "Cell \u001b[0;32mIn[33], line 27\u001b[0m\n\u001b[1;32m 25\u001b[0m \u001b[38;5;66;03m# Fill NaN values with empty strings\u001b[39;00m\n\u001b[1;32m 26\u001b[0m metadata \u001b[38;5;241m=\u001b[39m metadata\u001b[38;5;241m.\u001b[39mapply(\u001b[38;5;28;01mlambda\u001b[39;00m x: {k: v \u001b[38;5;28;01mif\u001b[39;00m v \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m'\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m k, v \u001b[38;5;129;01min\u001b[39;00m x\u001b[38;5;241m.\u001b[39mitems()})\n\u001b[0;32m---> 27\u001b[0m df \u001b[38;5;241m=\u001b[39m pd\u001b[38;5;241m.\u001b[39mconcat([df[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mFilename\u001b[39m\u001b[38;5;124m'\u001b[39m], pd\u001b[38;5;241m.\u001b[39mDataFrame(metadata\u001b[38;5;241m.\u001b[39mtolist())], axis\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m1\u001b[39m)\n\u001b[1;32m 28\u001b[0m dataframes\u001b[38;5;241m.\u001b[39mappend(df)\n\u001b[1;32m 30\u001b[0m \u001b[38;5;66;03m# Concatenate all dataframes\u001b[39;00m\n",
  503. "File \u001b[0;32m~/.conda/envs/final-checking-meta/lib/python3.12/site-packages/pandas/core/frame.py:4102\u001b[0m, in \u001b[0;36mDataFrame.__getitem__\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 4100\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcolumns\u001b[38;5;241m.\u001b[39mnlevels \u001b[38;5;241m>\u001b[39m \u001b[38;5;241m1\u001b[39m:\n\u001b[1;32m 4101\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_getitem_multilevel(key)\n\u001b[0;32m-> 4102\u001b[0m indexer \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcolumns\u001b[38;5;241m.\u001b[39mget_loc(key)\n\u001b[1;32m 4103\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m is_integer(indexer):\n\u001b[1;32m 4104\u001b[0m indexer \u001b[38;5;241m=\u001b[39m [indexer]\n",
  504. "File \u001b[0;32m~/.conda/envs/final-checking-meta/lib/python3.12/site-packages/pandas/core/indexes/base.py:3812\u001b[0m, in \u001b[0;36mIndex.get_loc\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 3807\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(casted_key, \u001b[38;5;28mslice\u001b[39m) \u001b[38;5;129;01mor\u001b[39;00m (\n\u001b[1;32m 3808\u001b[0m \u001b[38;5;28misinstance\u001b[39m(casted_key, abc\u001b[38;5;241m.\u001b[39mIterable)\n\u001b[1;32m 3809\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28many\u001b[39m(\u001b[38;5;28misinstance\u001b[39m(x, \u001b[38;5;28mslice\u001b[39m) \u001b[38;5;28;01mfor\u001b[39;00m x \u001b[38;5;129;01min\u001b[39;00m casted_key)\n\u001b[1;32m 3810\u001b[0m ):\n\u001b[1;32m 3811\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m InvalidIndexError(key)\n\u001b[0;32m-> 3812\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m(key) \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01merr\u001b[39;00m\n\u001b[1;32m 3813\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mTypeError\u001b[39;00m:\n\u001b[1;32m 3814\u001b[0m \u001b[38;5;66;03m# If we have a listlike key, _check_indexing_error will raise\u001b[39;00m\n\u001b[1;32m 3815\u001b[0m \u001b[38;5;66;03m# InvalidIndexError. Otherwise we fall through and re-raise\u001b[39;00m\n\u001b[1;32m 3816\u001b[0m \u001b[38;5;66;03m# the TypeError.\u001b[39;00m\n\u001b[1;32m 3817\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_check_indexing_error(key)\n",
  505. "\u001b[0;31mKeyError\u001b[0m: 'Filename'"
  506. ]
  507. }
  508. ],
  509. "source": [
  510. "def parse_caption(caption):\n",
  511. " try:\n",
  512. " # Extract JSON string from caption\n",
  513. " json_str = re.search(r'end_header_id\\|>\\s*(\\{.*?\\})\\s*<\\|eot_id\\|>', caption, re.DOTALL)\n",
  514. " if json_str:\n",
  515. " json_data = json.loads(json_str.group(1))\n",
  516. " return json_data\n",
  517. " else:\n",
  518. " print(f\"JSON data not found in caption: {caption[:50]}...\")\n",
  519. " return {}\n",
  520. " except json.JSONDecodeError as e:\n",
  521. " print(f\"JSON decode error: {str(e)}\")\n",
  522. " print(f\"Problematic caption: {caption[:50]}...\")\n",
  523. " return {}\n",
  524. "\n",
  525. "# List of your CSV files\n",
  526. "#csv_files = ['file1.csv', 'file2.csv', ..., 'file8.csv']\n",
  527. "\n",
  528. "# Read and process each CSV\n",
  529. "dataframes = []\n",
  530. "for file in csv_files:\n",
  531. " df = pd.read_csv(file)\n",
  532. " # Parse caption and create new columns\n",
  533. " metadata = df['description'].apply(parse_caption)\n",
  534. " # Fill NaN values with empty strings\n",
  535. " metadata = metadata.apply(lambda x: {k: v if v is not None else '' for k, v in x.items()})\n",
  536. " df = pd.concat([df['Filename'], pd.DataFrame(metadata.tolist())], axis=1)\n",
  537. " dataframes.append(df)\n",
  538. "\n",
  539. "# Concatenate all dataframes\n",
  540. "result = pd.concat(dataframes, ignore_index=True)\n",
  541. "\n",
  542. "# Save the result\n",
  543. "result.to_csv('joined_data.csv', index=False)\n",
  544. "\n",
  545. "# Read and process each CSV\n",
  546. "dataframes = []\n",
  547. "for file in csv_files:\n",
  548. " df = pd.read_csv(file)\n",
  549. " # Parse caption and create new columns\n",
  550. " metadata = df['description'].apply(parse_caption)\n",
  551. " df = pd.concat([df['Filename'], pd.DataFrame(metadata.tolist())], axis=1)\n",
  552. " dataframes.append(df)\n",
  553. "\n",
  554. "# Concatenate all dataframes\n",
  555. "result = pd.concat(dataframes, ignore_index=True)\n",
  556. "\n",
  557. "# Save the result\n",
  558. "result.to_csv('joined_data.csv', index=False)"
  559. ]
  560. },
  561. {
  562. "cell_type": "code",
  563. "execution_count": 40,
  564. "id": "fd13a94a-ed78-4bf1-b264-538610fbb302",
  565. "metadata": {},
  566. "outputs": [
  567. {
  568. "data": {
  569. "text/plain": [
  570. "np.int64(3117)"
  571. ]
  572. },
  573. "execution_count": 40,
  574. "metadata": {},
  575. "output_type": "execute_result"
  576. }
  577. ],
  578. "source": [
  579. "len(result) - result['Title'].isna().sum()"
  580. ]
  581. },
  582. {
  583. "cell_type": "code",
  584. "execution_count": 35,
  585. "id": "51e062a4-670c-49b7-912f-6649556a36f6",
  586. "metadata": {},
  587. "outputs": [
  588. {
  589. "data": {
  590. "text/plain": [
  591. "count 3117\n",
  592. "unique 2757\n",
  593. "top Blue Denim Jeans\n",
  594. "freq 16\n",
  595. "Name: Title, dtype: object"
  596. ]
  597. },
  598. "execution_count": 35,
  599. "metadata": {},
  600. "output_type": "execute_result"
  601. }
  602. ],
  603. "source": [
  604. "result['Title'].describe()"
  605. ]
  606. },
  607. {
  608. "cell_type": "code",
  609. "execution_count": 41,
  610. "id": "d49e49c6-7e44-4bf2-bd53-d6eeaf4a824a",
  611. "metadata": {},
  612. "outputs": [
  613. {
  614. "data": {
  615. "text/html": [
  616. "<div>\n",
  617. "<style scoped>\n",
  618. " .dataframe tbody tr th:only-of-type {\n",
  619. " vertical-align: middle;\n",
  620. " }\n",
  621. "\n",
  622. " .dataframe tbody tr th {\n",
  623. " vertical-align: top;\n",
  624. " }\n",
  625. "\n",
  626. " .dataframe thead th {\n",
  627. " text-align: right;\n",
  628. " }\n",
  629. "</style>\n",
  630. "<table border=\"1\" class=\"dataframe\">\n",
  631. " <thead>\n",
  632. " <tr style=\"text-align: right;\">\n",
  633. " <th></th>\n",
  634. " <th>Filename</th>\n",
  635. " <th>Title</th>\n",
  636. " <th>Size</th>\n",
  637. " <th>Category</th>\n",
  638. " <th>Gender</th>\n",
  639. " <th>Type</th>\n",
  640. " <th>Description</th>\n",
  641. " <th>size</th>\n",
  642. " </tr>\n",
  643. " </thead>\n",
  644. " <tbody>\n",
  645. " <tr>\n",
  646. " <th>0</th>\n",
  647. " <td>d7ed1d64-2c65-427f-9ae4-eb4aaa3e2389.jpg</td>\n",
  648. " <td>Stylish and Trendy Tank Top with Celestial Design</td>\n",
  649. " <td>M</td>\n",
  650. " <td>Tops</td>\n",
  651. " <td>F</td>\n",
  652. " <td>Casual</td>\n",
  653. " <td>This white tank top is a stylish and trendy pi...</td>\n",
  654. " <td>NaN</td>\n",
  655. " </tr>\n",
  656. " <tr>\n",
  657. " <th>1</th>\n",
  658. " <td>5c1b7a77-1fa3-4af8-9722-cd38e45d89da.jpg</td>\n",
  659. " <td>Classic White Sweatshirt</td>\n",
  660. " <td>M</td>\n",
  661. " <td>Tops</td>\n",
  662. " <td>F</td>\n",
  663. " <td>Casual</td>\n",
  664. " <td>This classic white sweatshirt is a timeless pi...</td>\n",
  665. " <td>NaN</td>\n",
  666. " </tr>\n",
  667. " <tr>\n",
  668. " <th>2</th>\n",
  669. " <td>b2e084c7-e3a0-4182-8671-b908544a7cf2.jpg</td>\n",
  670. " <td>Grey T-shirt</td>\n",
  671. " <td>M</td>\n",
  672. " <td>T-Shirt</td>\n",
  673. " <td>Unisex</td>\n",
  674. " <td>Casual</td>\n",
  675. " <td>This is a short-sleeved, crew neck t-shirt tha...</td>\n",
  676. " <td>NaN</td>\n",
  677. " </tr>\n",
  678. " <tr>\n",
  679. " <th>3</th>\n",
  680. " <td>9d053b67-64e1-4050-a509-27332b9eca54.jpg</td>\n",
  681. " <td>NaN</td>\n",
  682. " <td>NaN</td>\n",
  683. " <td>NaN</td>\n",
  684. " <td>NaN</td>\n",
  685. " <td>NaN</td>\n",
  686. " <td>NaN</td>\n",
  687. " <td>NaN</td>\n",
  688. " </tr>\n",
  689. " <tr>\n",
  690. " <th>4</th>\n",
  691. " <td>d885f493-1070-4d51-bd11-f1ec156a2aa7.jpg</td>\n",
  692. " <td>NaN</td>\n",
  693. " <td>NaN</td>\n",
  694. " <td>NaN</td>\n",
  695. " <td>NaN</td>\n",
  696. " <td>NaN</td>\n",
  697. " <td>NaN</td>\n",
  698. " <td>NaN</td>\n",
  699. " </tr>\n",
  700. " <tr>\n",
  701. " <th>...</th>\n",
  702. " <td>...</td>\n",
  703. " <td>...</td>\n",
  704. " <td>...</td>\n",
  705. " <td>...</td>\n",
  706. " <td>...</td>\n",
  707. " <td>...</td>\n",
  708. " <td>...</td>\n",
  709. " <td>...</td>\n",
  710. " </tr>\n",
  711. " <tr>\n",
  712. " <th>5751</th>\n",
  713. " <td>ae9cec7a-dd1d-49bc-adae-6446429c03d8.jpg</td>\n",
  714. " <td>Men's Light Blue and White Striped Long-Sleeve...</td>\n",
  715. " <td>M</td>\n",
  716. " <td>Tops</td>\n",
  717. " <td>M</td>\n",
  718. " <td>Casual</td>\n",
  719. " <td>This men's light blue and white striped long-s...</td>\n",
  720. " <td>NaN</td>\n",
  721. " </tr>\n",
  722. " <tr>\n",
  723. " <th>5752</th>\n",
  724. " <td>de853711-0b97-45a6-a794-3c424246db03.jpg</td>\n",
  725. " <td>Black Sneakers</td>\n",
  726. " <td>S</td>\n",
  727. " <td>Shoes</td>\n",
  728. " <td>U</td>\n",
  729. " <td>Casual</td>\n",
  730. " <td>These sleek and versatile black sneakers are a...</td>\n",
  731. " <td>NaN</td>\n",
  732. " </tr>\n",
  733. " <tr>\n",
  734. " <th>5753</th>\n",
  735. " <td>d4b0b957-5632-4df1-aba6-e562e2a84687.jpg</td>\n",
  736. " <td>Gray T-Shirt with Hood and Graphic</td>\n",
  737. " <td>M</td>\n",
  738. " <td>T-Shirt</td>\n",
  739. " <td>M</td>\n",
  740. " <td>Casual</td>\n",
  741. " <td>The gray t-shirt with a hood and graphic is a ...</td>\n",
  742. " <td>NaN</td>\n",
  743. " </tr>\n",
  744. " <tr>\n",
  745. " <th>5754</th>\n",
  746. " <td>89074ff2-ebfe-4790-892e-8513625a05b0.jpg</td>\n",
  747. " <td>NaN</td>\n",
  748. " <td>NaN</td>\n",
  749. " <td>NaN</td>\n",
  750. " <td>NaN</td>\n",
  751. " <td>NaN</td>\n",
  752. " <td>NaN</td>\n",
  753. " <td>NaN</td>\n",
  754. " </tr>\n",
  755. " <tr>\n",
  756. " <th>5755</th>\n",
  757. " <td>0949e8e0-c807-4b6d-8453-80a05f1b733e.jpg</td>\n",
  758. " <td>NaN</td>\n",
  759. " <td>NaN</td>\n",
  760. " <td>NaN</td>\n",
  761. " <td>NaN</td>\n",
  762. " <td>NaN</td>\n",
  763. " <td>NaN</td>\n",
  764. " <td>NaN</td>\n",
  765. " </tr>\n",
  766. " </tbody>\n",
  767. "</table>\n",
  768. "<p>5756 rows × 8 columns</p>\n",
  769. "</div>"
  770. ],
  771. "text/plain": [
  772. " Filename \\\n",
  773. "0 d7ed1d64-2c65-427f-9ae4-eb4aaa3e2389.jpg \n",
  774. "1 5c1b7a77-1fa3-4af8-9722-cd38e45d89da.jpg \n",
  775. "2 b2e084c7-e3a0-4182-8671-b908544a7cf2.jpg \n",
  776. "3 9d053b67-64e1-4050-a509-27332b9eca54.jpg \n",
  777. "4 d885f493-1070-4d51-bd11-f1ec156a2aa7.jpg \n",
  778. "... ... \n",
  779. "5751 ae9cec7a-dd1d-49bc-adae-6446429c03d8.jpg \n",
  780. "5752 de853711-0b97-45a6-a794-3c424246db03.jpg \n",
  781. "5753 d4b0b957-5632-4df1-aba6-e562e2a84687.jpg \n",
  782. "5754 89074ff2-ebfe-4790-892e-8513625a05b0.jpg \n",
  783. "5755 0949e8e0-c807-4b6d-8453-80a05f1b733e.jpg \n",
  784. "\n",
  785. " Title Size Category Gender \\\n",
  786. "0 Stylish and Trendy Tank Top with Celestial Design M Tops F \n",
  787. "1 Classic White Sweatshirt M Tops F \n",
  788. "2 Grey T-shirt M T-Shirt Unisex \n",
  789. "3 NaN NaN NaN NaN \n",
  790. "4 NaN NaN NaN NaN \n",
  791. "... ... ... ... ... \n",
  792. "5751 Men's Light Blue and White Striped Long-Sleeve... M Tops M \n",
  793. "5752 Black Sneakers S Shoes U \n",
  794. "5753 Gray T-Shirt with Hood and Graphic M T-Shirt M \n",
  795. "5754 NaN NaN NaN NaN \n",
  796. "5755 NaN NaN NaN NaN \n",
  797. "\n",
  798. " Type Description size \n",
  799. "0 Casual This white tank top is a stylish and trendy pi... NaN \n",
  800. "1 Casual This classic white sweatshirt is a timeless pi... NaN \n",
  801. "2 Casual This is a short-sleeved, crew neck t-shirt tha... NaN \n",
  802. "3 NaN NaN NaN \n",
  803. "4 NaN NaN NaN \n",
  804. "... ... ... ... \n",
  805. "5751 Casual This men's light blue and white striped long-s... NaN \n",
  806. "5752 Casual These sleek and versatile black sneakers are a... NaN \n",
  807. "5753 Casual The gray t-shirt with a hood and graphic is a ... NaN \n",
  808. "5754 NaN NaN NaN \n",
  809. "5755 NaN NaN NaN \n",
  810. "\n",
  811. "[5756 rows x 8 columns]"
  812. ]
  813. },
  814. "execution_count": 41,
  815. "metadata": {},
  816. "output_type": "execute_result"
  817. }
  818. ],
  819. "source": [
  820. "result"
  821. ]
  822. },
  823. {
  824. "cell_type": "markdown",
  825. "id": "9577f9f6-23e7-4fde-a162-2fa633265399",
  826. "metadata": {},
  827. "source": [
  828. "### Creating a Vector DB"
  829. ]
  830. },
  831. {
  832. "cell_type": "code",
  833. "execution_count": null,
  834. "id": "5e7d968d-bf1b-4a43-ad9f-7f2ca6736c1d",
  835. "metadata": {},
  836. "outputs": [],
  837. "source": []
  838. }
  839. ],
  840. "metadata": {
  841. "kernelspec": {
  842. "display_name": "Python 3 (ipykernel)",
  843. "language": "python",
  844. "name": "python3"
  845. },
  846. "language_info": {
  847. "codemirror_mode": {
  848. "name": "ipython",
  849. "version": 3
  850. },
  851. "file_extension": ".py",
  852. "mimetype": "text/x-python",
  853. "name": "python",
  854. "nbconvert_exporter": "python",
  855. "pygments_lexer": "ipython3",
  856. "version": "3.12.5"
  857. }
  858. },
  859. "nbformat": 4,
  860. "nbformat_minor": 5
  861. }