|
@@ -5,7 +5,16 @@
|
|
|
"id": "98cc49e3-6669-4a7a-be02-a2025d397a4c",
|
|
|
"metadata": {},
|
|
|
"source": [
|
|
|
- "## Cleaning up the Annotations and Creating Vector DB"
|
|
|
+ "## Cleaning up the Annotations and Creating Vector DB\n",
|
|
|
+ "\n",
|
|
|
+ "This is notebook 2 in the workshop/course series. Like most readers, you can skip the recap, but here it is regardless. So far:\n",
|
|
|
+ "\n",
|
|
|
+ "- We used a dataset of 5000 images with some meta-data\n",
|
|
|
+ "- Cleaned up corrupt images\n",
|
|
|
+ "- Pre-processed categories to reduce complexity\n",
|
|
|
+ "- Balanced categories by random sampling\n",
|
|
|
+ "- Iterated and prompted 11B to label images\n",
|
|
|
+ "- Created a script to label images"
|
|
|
]
|
|
|
},
|
|
|
{
|
|
@@ -13,7 +22,8 @@
|
|
|
"id": "6c6b84dd-ac69-49b5-9f4b-3c22d60c585c",
|
|
|
"metadata": {},
|
|
|
"source": [
|
|
|
- "### Cleaning up Annotations"
|
|
|
+ "### Cleaning up Annotations\n",
|
|
|
+ "\n"
|
|
|
]
|
|
|
},
|
|
|
{
|
|
@@ -41,7 +51,9 @@
|
|
|
"import pandas as pd\n",
|
|
|
"import numpy as np\n",
|
|
|
"import json\n",
|
|
|
- "import re"
|
|
|
+ "import re\n",
|
|
|
+ "import matplotlib.pyplot as plt\n",
|
|
|
+ "import seaborn as sns"
|
|
|
]
|
|
|
},
|
|
|
{
|
|
@@ -522,9 +534,6 @@
|
|
|
" print(f\"Problematic caption: {caption[:50]}...\")\n",
|
|
|
" return {}\n",
|
|
|
"\n",
|
|
|
- "# List of your CSV files\n",
|
|
|
- "#csv_files = ['file1.csv', 'file2.csv', ..., 'file8.csv']\n",
|
|
|
- "\n",
|
|
|
"# Read and process each CSV\n",
|
|
|
"dataframes = []\n",
|
|
|
"for file in csv_files:\n",
|
|
@@ -1252,17 +1261,6 @@
|
|
|
},
|
|
|
{
|
|
|
"cell_type": "code",
|
|
|
- "execution_count": 70,
|
|
|
- "id": "d3837323-a815-4337-b5e2-24322fec6b08",
|
|
|
- "metadata": {},
|
|
|
- "outputs": [],
|
|
|
- "source": [
|
|
|
- "import matplotlib.pyplot as plt\n",
|
|
|
- "import seaborn as sns"
|
|
|
- ]
|
|
|
- },
|
|
|
- {
|
|
|
- "cell_type": "code",
|
|
|
"execution_count": 73,
|
|
|
"id": "f8476f83-a0ec-408d-a471-5bab4e4e330b",
|
|
|
"metadata": {},
|
|
@@ -1708,673 +1706,14 @@
|
|
|
]
|
|
|
},
|
|
|
{
|
|
|
- "cell_type": "markdown",
|
|
|
- "id": "9577f9f6-23e7-4fde-a162-2fa633265399",
|
|
|
- "metadata": {},
|
|
|
- "source": [
|
|
|
- "### Creating a Vector DB"
|
|
|
- ]
|
|
|
- },
|
|
|
- {
|
|
|
- "cell_type": "code",
|
|
|
- "execution_count": 86,
|
|
|
- "id": "5e7d968d-bf1b-4a43-ad9f-7f2ca6736c1d",
|
|
|
- "metadata": {},
|
|
|
- "outputs": [],
|
|
|
- "source": [
|
|
|
- "#!pip install lancedb rerankers\n",
|
|
|
- "#!pip install sentence-transformers"
|
|
|
- ]
|
|
|
- },
|
|
|
- {
|
|
|
- "cell_type": "code",
|
|
|
- "execution_count": 1,
|
|
|
- "id": "a0db3c93-a0f2-4f49-908a-181b63b5847e",
|
|
|
- "metadata": {},
|
|
|
- "outputs": [
|
|
|
- {
|
|
|
- "data": {
|
|
|
- "application/vnd.jupyter.widget-view+json": {
|
|
|
- "model_id": "a176716866754071b3f28533a0b5db10",
|
|
|
- "version_major": 2,
|
|
|
- "version_minor": 0
|
|
|
- },
|
|
|
- "text/plain": [
|
|
|
- "modules.json: 0%| | 0.00/349 [00:00<?, ?B/s]"
|
|
|
- ]
|
|
|
- },
|
|
|
- "metadata": {},
|
|
|
- "output_type": "display_data"
|
|
|
- },
|
|
|
- {
|
|
|
- "data": {
|
|
|
- "application/vnd.jupyter.widget-view+json": {
|
|
|
- "model_id": "ac21a9a40c6f410e84bd46137de72cf2",
|
|
|
- "version_major": 2,
|
|
|
- "version_minor": 0
|
|
|
- },
|
|
|
- "text/plain": [
|
|
|
- "config_sentence_transformers.json: 0%| | 0.00/124 [00:00<?, ?B/s]"
|
|
|
- ]
|
|
|
- },
|
|
|
- "metadata": {},
|
|
|
- "output_type": "display_data"
|
|
|
- },
|
|
|
- {
|
|
|
- "data": {
|
|
|
- "application/vnd.jupyter.widget-view+json": {
|
|
|
- "model_id": "33a4b93a6e4d4ed2a6212b9bb3177f97",
|
|
|
- "version_major": 2,
|
|
|
- "version_minor": 0
|
|
|
- },
|
|
|
- "text/plain": [
|
|
|
- "README.md: 0%| | 0.00/94.8k [00:00<?, ?B/s]"
|
|
|
- ]
|
|
|
- },
|
|
|
- "metadata": {},
|
|
|
- "output_type": "display_data"
|
|
|
- },
|
|
|
- {
|
|
|
- "data": {
|
|
|
- "application/vnd.jupyter.widget-view+json": {
|
|
|
- "model_id": "1884e2f091dc42e8bc0199f16542bfd9",
|
|
|
- "version_major": 2,
|
|
|
- "version_minor": 0
|
|
|
- },
|
|
|
- "text/plain": [
|
|
|
- "sentence_bert_config.json: 0%| | 0.00/52.0 [00:00<?, ?B/s]"
|
|
|
- ]
|
|
|
- },
|
|
|
- "metadata": {},
|
|
|
- "output_type": "display_data"
|
|
|
- },
|
|
|
- {
|
|
|
- "data": {
|
|
|
- "application/vnd.jupyter.widget-view+json": {
|
|
|
- "model_id": "f9dfd32a6a964b33bfb7dcb2ff8a5770",
|
|
|
- "version_major": 2,
|
|
|
- "version_minor": 0
|
|
|
- },
|
|
|
- "text/plain": [
|
|
|
- "config.json: 0%| | 0.00/743 [00:00<?, ?B/s]"
|
|
|
- ]
|
|
|
- },
|
|
|
- "metadata": {},
|
|
|
- "output_type": "display_data"
|
|
|
- },
|
|
|
- {
|
|
|
- "data": {
|
|
|
- "application/vnd.jupyter.widget-view+json": {
|
|
|
- "model_id": "2f16f57b26544e4d9c7aabfc9e6ec7e3",
|
|
|
- "version_major": 2,
|
|
|
- "version_minor": 0
|
|
|
- },
|
|
|
- "text/plain": [
|
|
|
- "model.safetensors: 0%| | 0.00/133M [00:00<?, ?B/s]"
|
|
|
- ]
|
|
|
- },
|
|
|
- "metadata": {},
|
|
|
- "output_type": "display_data"
|
|
|
- },
|
|
|
- {
|
|
|
- "data": {
|
|
|
- "application/vnd.jupyter.widget-view+json": {
|
|
|
- "model_id": "50deb4679796404d8ea10c53c6da2d92",
|
|
|
- "version_major": 2,
|
|
|
- "version_minor": 0
|
|
|
- },
|
|
|
- "text/plain": [
|
|
|
- "tokenizer_config.json: 0%| | 0.00/366 [00:00<?, ?B/s]"
|
|
|
- ]
|
|
|
- },
|
|
|
- "metadata": {},
|
|
|
- "output_type": "display_data"
|
|
|
- },
|
|
|
- {
|
|
|
- "data": {
|
|
|
- "application/vnd.jupyter.widget-view+json": {
|
|
|
- "model_id": "2c8ccaf41b4e4fe8a8dbfa3f90fd760c",
|
|
|
- "version_major": 2,
|
|
|
- "version_minor": 0
|
|
|
- },
|
|
|
- "text/plain": [
|
|
|
- "vocab.txt: 0%| | 0.00/232k [00:00<?, ?B/s]"
|
|
|
- ]
|
|
|
- },
|
|
|
- "metadata": {},
|
|
|
- "output_type": "display_data"
|
|
|
- },
|
|
|
- {
|
|
|
- "data": {
|
|
|
- "application/vnd.jupyter.widget-view+json": {
|
|
|
- "model_id": "1bf4bcb6c2c5482d827f72131f6719ce",
|
|
|
- "version_major": 2,
|
|
|
- "version_minor": 0
|
|
|
- },
|
|
|
- "text/plain": [
|
|
|
- "tokenizer.json: 0%| | 0.00/711k [00:00<?, ?B/s]"
|
|
|
- ]
|
|
|
- },
|
|
|
- "metadata": {},
|
|
|
- "output_type": "display_data"
|
|
|
- },
|
|
|
- {
|
|
|
- "data": {
|
|
|
- "application/vnd.jupyter.widget-view+json": {
|
|
|
- "model_id": "25402e266bf04a138f31c8dfea9e3d16",
|
|
|
- "version_major": 2,
|
|
|
- "version_minor": 0
|
|
|
- },
|
|
|
- "text/plain": [
|
|
|
- "special_tokens_map.json: 0%| | 0.00/125 [00:00<?, ?B/s]"
|
|
|
- ]
|
|
|
- },
|
|
|
- "metadata": {},
|
|
|
- "output_type": "display_data"
|
|
|
- },
|
|
|
- {
|
|
|
- "data": {
|
|
|
- "application/vnd.jupyter.widget-view+json": {
|
|
|
- "model_id": "ae377bce030646a2a5823c9d1bbaf481",
|
|
|
- "version_major": 2,
|
|
|
- "version_minor": 0
|
|
|
- },
|
|
|
- "text/plain": [
|
|
|
- "1_Pooling/config.json: 0%| | 0.00/190 [00:00<?, ?B/s]"
|
|
|
- ]
|
|
|
- },
|
|
|
- "metadata": {},
|
|
|
- "output_type": "display_data"
|
|
|
- },
|
|
|
- {
|
|
|
- "name": "stderr",
|
|
|
- "output_type": "stream",
|
|
|
- "text": [
|
|
|
- "[2024-10-02T21:22:52Z WARN lance::dataset] No existing dataset at /home/sanyambhutani/.lancedb/clothes.lance, it will be created\n"
|
|
|
- ]
|
|
|
- }
|
|
|
- ],
|
|
|
- "source": [
|
|
|
- "import lancedb\n",
|
|
|
- "from lancedb.pydantic import LanceModel, Vector\n",
|
|
|
- "from lancedb.embeddings import get_registry\n",
|
|
|
- "from lancedb.rerankers import ColbertReranker\n",
|
|
|
- "\n",
|
|
|
- "model = get_registry().get(\"sentence-transformers\").create(name=\"BAAI/bge-small-en-v1.5\", device=\"cuda\")\n",
|
|
|
- "\n",
|
|
|
- "\n",
|
|
|
- "class Schema(LanceModel):\n",
|
|
|
- " Filename: str\n",
|
|
|
- " Title: str\n",
|
|
|
- " Size: str\n",
|
|
|
- " Gender: str\n",
|
|
|
- " Description: str = model.SourceField()\n",
|
|
|
- " vector: Vector(model.ndims()) = model.VectorField()\n",
|
|
|
- " Category: str\n",
|
|
|
- " Type: str\n",
|
|
|
- " \n",
|
|
|
- "db = lancedb.connect(\"~/.lancedb\")\n",
|
|
|
- "tbl = db.create_table(name=\"clothes\", schema=Schema, mode=\"overwrite\")"
|
|
|
- ]
|
|
|
- },
|
|
|
- {
|
|
|
- "cell_type": "code",
|
|
|
- "execution_count": 8,
|
|
|
- "id": "a3341568-d835-4c80-8e90-de651657bcca",
|
|
|
- "metadata": {},
|
|
|
- "outputs": [],
|
|
|
- "source": [
|
|
|
- "df = pd.read_csv(\"./final_balanced_sample_dataset.csv\")"
|
|
|
- ]
|
|
|
- },
|
|
|
- {
|
|
|
- "cell_type": "code",
|
|
|
- "execution_count": 9,
|
|
|
- "id": "47976e72-6093-4314-a7d9-fedba7316e56",
|
|
|
- "metadata": {},
|
|
|
- "outputs": [
|
|
|
- {
|
|
|
- "data": {
|
|
|
- "text/html": [
|
|
|
- "<div>\n",
|
|
|
- "<style scoped>\n",
|
|
|
- " .dataframe tbody tr th:only-of-type {\n",
|
|
|
- " vertical-align: middle;\n",
|
|
|
- " }\n",
|
|
|
- "\n",
|
|
|
- " .dataframe tbody tr th {\n",
|
|
|
- " vertical-align: top;\n",
|
|
|
- " }\n",
|
|
|
- "\n",
|
|
|
- " .dataframe thead th {\n",
|
|
|
- " text-align: right;\n",
|
|
|
- " }\n",
|
|
|
- "</style>\n",
|
|
|
- "<table border=\"1\" class=\"dataframe\">\n",
|
|
|
- " <thead>\n",
|
|
|
- " <tr style=\"text-align: right;\">\n",
|
|
|
- " <th></th>\n",
|
|
|
- " <th>Filename</th>\n",
|
|
|
- " <th>Title</th>\n",
|
|
|
- " <th>Size</th>\n",
|
|
|
- " <th>Gender</th>\n",
|
|
|
- " <th>Description</th>\n",
|
|
|
- " <th>Category</th>\n",
|
|
|
- " <th>Type</th>\n",
|
|
|
- " </tr>\n",
|
|
|
- " </thead>\n",
|
|
|
- " <tbody>\n",
|
|
|
- " <tr>\n",
|
|
|
- " <th>0</th>\n",
|
|
|
- " <td>d7ed1d64-2c65-427f-9ae4-eb4aaa3e2389.jpg</td>\n",
|
|
|
- " <td>Stylish and Trendy Tank Top with Celestial Design</td>\n",
|
|
|
- " <td>M</td>\n",
|
|
|
- " <td>F</td>\n",
|
|
|
- " <td>This white tank top is a stylish and trendy pi...</td>\n",
|
|
|
- " <td>Tops</td>\n",
|
|
|
- " <td>Casual</td>\n",
|
|
|
- " </tr>\n",
|
|
|
- " <tr>\n",
|
|
|
- " <th>1</th>\n",
|
|
|
- " <td>5c1b7a77-1fa3-4af8-9722-cd38e45d89da.jpg</td>\n",
|
|
|
- " <td>Classic White Sweatshirt</td>\n",
|
|
|
- " <td>M</td>\n",
|
|
|
- " <td>F</td>\n",
|
|
|
- " <td>This classic white sweatshirt is a timeless pi...</td>\n",
|
|
|
- " <td>Tops</td>\n",
|
|
|
- " <td>Casual</td>\n",
|
|
|
- " </tr>\n",
|
|
|
- " <tr>\n",
|
|
|
- " <th>2</th>\n",
|
|
|
- " <td>b2e084c7-e3a0-4182-8671-b908544a7cf2.jpg</td>\n",
|
|
|
- " <td>Grey T-shirt</td>\n",
|
|
|
- " <td>M</td>\n",
|
|
|
- " <td>Unisex</td>\n",
|
|
|
- " <td>This is a short-sleeved, crew neck t-shirt tha...</td>\n",
|
|
|
- " <td>T-Shirt</td>\n",
|
|
|
- " <td>Casual</td>\n",
|
|
|
- " </tr>\n",
|
|
|
- " <tr>\n",
|
|
|
- " <th>3</th>\n",
|
|
|
- " <td>87846aa9-86cc-404a-af2c-7e8fe941081d.jpg</td>\n",
|
|
|
- " <td>Long-Sleeved V-Neck Shirt</td>\n",
|
|
|
- " <td>L</td>\n",
|
|
|
- " <td>U</td>\n",
|
|
|
- " <td>A long-sleeved, V-neck shirt with a solid purp...</td>\n",
|
|
|
- " <td>Tops</td>\n",
|
|
|
- " <td>Casual</td>\n",
|
|
|
- " </tr>\n",
|
|
|
- " <tr>\n",
|
|
|
- " <th>4</th>\n",
|
|
|
- " <td>04fa06fb-d71a-4293-9804-fe799375a682.jpg</td>\n",
|
|
|
- " <td>Silver Metallic Buckle Sandals</td>\n",
|
|
|
- " <td>L</td>\n",
|
|
|
- " <td>F</td>\n",
|
|
|
- " <td>These silver metallic buckle sandals feature a...</td>\n",
|
|
|
- " <td>Shoes</td>\n",
|
|
|
- " <td>Casual</td>\n",
|
|
|
- " </tr>\n",
|
|
|
- " <tr>\n",
|
|
|
- " <th>...</th>\n",
|
|
|
- " <td>...</td>\n",
|
|
|
- " <td>...</td>\n",
|
|
|
- " <td>...</td>\n",
|
|
|
- " <td>...</td>\n",
|
|
|
- " <td>...</td>\n",
|
|
|
- " <td>...</td>\n",
|
|
|
- " <td>...</td>\n",
|
|
|
- " </tr>\n",
|
|
|
- " <tr>\n",
|
|
|
- " <th>3112</th>\n",
|
|
|
- " <td>c1fafe22-a65b-4ce4-9383-dbd470a205e6.jpg</td>\n",
|
|
|
- " <td>Pink Bird Printed Long Sleeved T-Shirt</td>\n",
|
|
|
- " <td>L</td>\n",
|
|
|
- " <td>F</td>\n",
|
|
|
- " <td>A long-sleeved t-shirt with a crew neck and pi...</td>\n",
|
|
|
- " <td>Tops</td>\n",
|
|
|
- " <td>Casual</td>\n",
|
|
|
- " </tr>\n",
|
|
|
- " <tr>\n",
|
|
|
- " <th>3113</th>\n",
|
|
|
- " <td>4cc0a79e-aa26-4afc-aabc-5612f8515bf8.jpg</td>\n",
|
|
|
- " <td>Blue and Gold Top</td>\n",
|
|
|
- " <td>L</td>\n",
|
|
|
- " <td>F</td>\n",
|
|
|
- " <td>This sleeveless top features a beautiful blue ...</td>\n",
|
|
|
- " <td>Tops</td>\n",
|
|
|
- " <td>Casual</td>\n",
|
|
|
- " </tr>\n",
|
|
|
- " <tr>\n",
|
|
|
- " <th>3114</th>\n",
|
|
|
- " <td>ae9cec7a-dd1d-49bc-adae-6446429c03d8.jpg</td>\n",
|
|
|
- " <td>Men's Light Blue and White Striped Long-Sleeve...</td>\n",
|
|
|
- " <td>M</td>\n",
|
|
|
- " <td>M</td>\n",
|
|
|
- " <td>This men's light blue and white striped long-s...</td>\n",
|
|
|
- " <td>Tops</td>\n",
|
|
|
- " <td>Casual</td>\n",
|
|
|
- " </tr>\n",
|
|
|
- " <tr>\n",
|
|
|
- " <th>3115</th>\n",
|
|
|
- " <td>de853711-0b97-45a6-a794-3c424246db03.jpg</td>\n",
|
|
|
- " <td>Black Sneakers</td>\n",
|
|
|
- " <td>S</td>\n",
|
|
|
- " <td>U</td>\n",
|
|
|
- " <td>These sleek and versatile black sneakers are a...</td>\n",
|
|
|
- " <td>Shoes</td>\n",
|
|
|
- " <td>Casual</td>\n",
|
|
|
- " </tr>\n",
|
|
|
- " <tr>\n",
|
|
|
- " <th>3116</th>\n",
|
|
|
- " <td>d4b0b957-5632-4df1-aba6-e562e2a84687.jpg</td>\n",
|
|
|
- " <td>Gray T-Shirt with Hood and Graphic</td>\n",
|
|
|
- " <td>M</td>\n",
|
|
|
- " <td>M</td>\n",
|
|
|
- " <td>The gray t-shirt with a hood and graphic is a ...</td>\n",
|
|
|
- " <td>T-Shirt</td>\n",
|
|
|
- " <td>Casual</td>\n",
|
|
|
- " </tr>\n",
|
|
|
- " </tbody>\n",
|
|
|
- "</table>\n",
|
|
|
- "<p>3117 rows × 7 columns</p>\n",
|
|
|
- "</div>"
|
|
|
- ],
|
|
|
- "text/plain": [
|
|
|
- " Filename \\\n",
|
|
|
- "0 d7ed1d64-2c65-427f-9ae4-eb4aaa3e2389.jpg \n",
|
|
|
- "1 5c1b7a77-1fa3-4af8-9722-cd38e45d89da.jpg \n",
|
|
|
- "2 b2e084c7-e3a0-4182-8671-b908544a7cf2.jpg \n",
|
|
|
- "3 87846aa9-86cc-404a-af2c-7e8fe941081d.jpg \n",
|
|
|
- "4 04fa06fb-d71a-4293-9804-fe799375a682.jpg \n",
|
|
|
- "... ... \n",
|
|
|
- "3112 c1fafe22-a65b-4ce4-9383-dbd470a205e6.jpg \n",
|
|
|
- "3113 4cc0a79e-aa26-4afc-aabc-5612f8515bf8.jpg \n",
|
|
|
- "3114 ae9cec7a-dd1d-49bc-adae-6446429c03d8.jpg \n",
|
|
|
- "3115 de853711-0b97-45a6-a794-3c424246db03.jpg \n",
|
|
|
- "3116 d4b0b957-5632-4df1-aba6-e562e2a84687.jpg \n",
|
|
|
- "\n",
|
|
|
- " Title Size Gender \\\n",
|
|
|
- "0 Stylish and Trendy Tank Top with Celestial Design M F \n",
|
|
|
- "1 Classic White Sweatshirt M F \n",
|
|
|
- "2 Grey T-shirt M Unisex \n",
|
|
|
- "3 Long-Sleeved V-Neck Shirt L U \n",
|
|
|
- "4 Silver Metallic Buckle Sandals L F \n",
|
|
|
- "... ... ... ... \n",
|
|
|
- "3112 Pink Bird Printed Long Sleeved T-Shirt L F \n",
|
|
|
- "3113 Blue and Gold Top L F \n",
|
|
|
- "3114 Men's Light Blue and White Striped Long-Sleeve... M M \n",
|
|
|
- "3115 Black Sneakers S U \n",
|
|
|
- "3116 Gray T-Shirt with Hood and Graphic M M \n",
|
|
|
- "\n",
|
|
|
- " Description Category Type \n",
|
|
|
- "0 This white tank top is a stylish and trendy pi... Tops Casual \n",
|
|
|
- "1 This classic white sweatshirt is a timeless pi... Tops Casual \n",
|
|
|
- "2 This is a short-sleeved, crew neck t-shirt tha... T-Shirt Casual \n",
|
|
|
- "3 A long-sleeved, V-neck shirt with a solid purp... Tops Casual \n",
|
|
|
- "4 These silver metallic buckle sandals feature a... Shoes Casual \n",
|
|
|
- "... ... ... ... \n",
|
|
|
- "3112 A long-sleeved t-shirt with a crew neck and pi... Tops Casual \n",
|
|
|
- "3113 This sleeveless top features a beautiful blue ... Tops Casual \n",
|
|
|
- "3114 This men's light blue and white striped long-s... Tops Casual \n",
|
|
|
- "3115 These sleek and versatile black sneakers are a... Shoes Casual \n",
|
|
|
- "3116 The gray t-shirt with a hood and graphic is a ... T-Shirt Casual \n",
|
|
|
- "\n",
|
|
|
- "[3117 rows x 7 columns]"
|
|
|
- ]
|
|
|
- },
|
|
|
- "execution_count": 9,
|
|
|
- "metadata": {},
|
|
|
- "output_type": "execute_result"
|
|
|
- }
|
|
|
- ],
|
|
|
- "source": [
|
|
|
- "df"
|
|
|
- ]
|
|
|
- },
|
|
|
- {
|
|
|
- "cell_type": "code",
|
|
|
- "execution_count": 10,
|
|
|
- "id": "9470c102-781d-4888-a373-efc184115cc8",
|
|
|
- "metadata": {},
|
|
|
- "outputs": [],
|
|
|
- "source": [
|
|
|
- "def fix_unescaped_quotes(json_string):\n",
|
|
|
- " # Find the \"Description\" field and its content\n",
|
|
|
- " pattern = r'\"Description\"\\s*:\\s*\"(.*?)\"(?=\\s*[,}])'\n",
|
|
|
- " \n",
|
|
|
- " def escape_quotes(match):\n",
|
|
|
- " # Escape any unescaped quotes in the description content\n",
|
|
|
- " content = match.group(1)\n",
|
|
|
- " escaped_content = re.sub(r'(?<!\\\\)\"', r'\\\"', content)\n",
|
|
|
- " return f'\"Description\":\"{escaped_content}\"'\n",
|
|
|
- " \n",
|
|
|
- " # Replace the Description field with properly escaped content\n",
|
|
|
- " fixed_json = re.sub(pattern, escape_quotes, json_string)\n",
|
|
|
- " # Now we can safely parse the JSON\n",
|
|
|
- " try:\n",
|
|
|
- " fixed_json = \"{\" + fixed_json.split(\"{\")[1].split(\"}\")[0] + \"}\"\n",
|
|
|
- " return json.loads(fixed_json)\n",
|
|
|
- " except:\n",
|
|
|
- " return {\n",
|
|
|
- " 'Title': \"\", 'Size': \"\" , 'Category': \"\" , 'Gender': \"\" , 'Type': \"\" , 'Description': \"\"\n",
|
|
|
- " }"
|
|
|
- ]
|
|
|
- },
|
|
|
- {
|
|
|
- "cell_type": "code",
|
|
|
- "execution_count": 13,
|
|
|
- "id": "3c9ce7f4-9c8d-4631-88b8-796a9c97ffdd",
|
|
|
- "metadata": {},
|
|
|
- "outputs": [],
|
|
|
- "source": [
|
|
|
- "data = [{\"Filename\": row[\"Filename\"],**fix_unescaped_quotes(row[\"Description\"])} for index, row in df.iterrows()]"
|
|
|
- ]
|
|
|
- },
|
|
|
- {
|
|
|
- "cell_type": "code",
|
|
|
- "execution_count": 14,
|
|
|
- "id": "8b0402c2-ee14-4bb3-bc0b-1854a6aac72a",
|
|
|
- "metadata": {},
|
|
|
- "outputs": [],
|
|
|
- "source": [
|
|
|
- "# automatically generate vectors\n",
|
|
|
- "tbl.add(data)"
|
|
|
- ]
|
|
|
- },
|
|
|
- {
|
|
|
- "cell_type": "code",
|
|
|
- "execution_count": 15,
|
|
|
- "id": "7a278c8f-17de-47a7-9f90-a8d68ea4e6aa",
|
|
|
- "metadata": {},
|
|
|
- "outputs": [
|
|
|
- {
|
|
|
- "data": {
|
|
|
- "text/html": [
|
|
|
- "<div>\n",
|
|
|
- "<style scoped>\n",
|
|
|
- " .dataframe tbody tr th:only-of-type {\n",
|
|
|
- " vertical-align: middle;\n",
|
|
|
- " }\n",
|
|
|
- "\n",
|
|
|
- " .dataframe tbody tr th {\n",
|
|
|
- " vertical-align: top;\n",
|
|
|
- " }\n",
|
|
|
- "\n",
|
|
|
- " .dataframe thead th {\n",
|
|
|
- " text-align: right;\n",
|
|
|
- " }\n",
|
|
|
- "</style>\n",
|
|
|
- "<table border=\"1\" class=\"dataframe\">\n",
|
|
|
- " <thead>\n",
|
|
|
- " <tr style=\"text-align: right;\">\n",
|
|
|
- " <th></th>\n",
|
|
|
- " <th>Filename</th>\n",
|
|
|
- " <th>Title</th>\n",
|
|
|
- " <th>Size</th>\n",
|
|
|
- " <th>Gender</th>\n",
|
|
|
- " <th>Description</th>\n",
|
|
|
- " <th>vector</th>\n",
|
|
|
- " <th>Category</th>\n",
|
|
|
- " <th>Type</th>\n",
|
|
|
- " </tr>\n",
|
|
|
- " </thead>\n",
|
|
|
- " <tbody>\n",
|
|
|
- " <tr>\n",
|
|
|
- " <th>0</th>\n",
|
|
|
- " <td>d7ed1d64-2c65-427f-9ae4-eb4aaa3e2389.jpg</td>\n",
|
|
|
- " <td></td>\n",
|
|
|
- " <td></td>\n",
|
|
|
- " <td></td>\n",
|
|
|
- " <td></td>\n",
|
|
|
- " <td>[0.04846169, -0.0012961391, 0.016879003, -0.04...</td>\n",
|
|
|
- " <td></td>\n",
|
|
|
- " <td></td>\n",
|
|
|
- " </tr>\n",
|
|
|
- " <tr>\n",
|
|
|
- " <th>1</th>\n",
|
|
|
- " <td>5c1b7a77-1fa3-4af8-9722-cd38e45d89da.jpg</td>\n",
|
|
|
- " <td></td>\n",
|
|
|
- " <td></td>\n",
|
|
|
- " <td></td>\n",
|
|
|
- " <td></td>\n",
|
|
|
- " <td>[0.04846169, -0.0012961391, 0.016879003, -0.04...</td>\n",
|
|
|
- " <td></td>\n",
|
|
|
- " <td></td>\n",
|
|
|
- " </tr>\n",
|
|
|
- " <tr>\n",
|
|
|
- " <th>2</th>\n",
|
|
|
- " <td>b2e084c7-e3a0-4182-8671-b908544a7cf2.jpg</td>\n",
|
|
|
- " <td></td>\n",
|
|
|
- " <td></td>\n",
|
|
|
- " <td></td>\n",
|
|
|
- " <td></td>\n",
|
|
|
- " <td>[0.04846169, -0.0012961391, 0.016879003, -0.04...</td>\n",
|
|
|
- " <td></td>\n",
|
|
|
- " <td></td>\n",
|
|
|
- " </tr>\n",
|
|
|
- " <tr>\n",
|
|
|
- " <th>3</th>\n",
|
|
|
- " <td>87846aa9-86cc-404a-af2c-7e8fe941081d.jpg</td>\n",
|
|
|
- " <td></td>\n",
|
|
|
- " <td></td>\n",
|
|
|
- " <td></td>\n",
|
|
|
- " <td></td>\n",
|
|
|
- " <td>[0.04846169, -0.0012961391, 0.016879003, -0.04...</td>\n",
|
|
|
- " <td></td>\n",
|
|
|
- " <td></td>\n",
|
|
|
- " </tr>\n",
|
|
|
- " <tr>\n",
|
|
|
- " <th>4</th>\n",
|
|
|
- " <td>04fa06fb-d71a-4293-9804-fe799375a682.jpg</td>\n",
|
|
|
- " <td></td>\n",
|
|
|
- " <td></td>\n",
|
|
|
- " <td></td>\n",
|
|
|
- " <td></td>\n",
|
|
|
- " <td>[0.04846169, -0.0012961391, 0.016879003, -0.04...</td>\n",
|
|
|
- " <td></td>\n",
|
|
|
- " <td></td>\n",
|
|
|
- " </tr>\n",
|
|
|
- " <tr>\n",
|
|
|
- " <th>5</th>\n",
|
|
|
- " <td>8f576f1a-839d-4fb2-a224-a4700b2d05da.jpg</td>\n",
|
|
|
- " <td></td>\n",
|
|
|
- " <td></td>\n",
|
|
|
- " <td></td>\n",
|
|
|
- " <td></td>\n",
|
|
|
- " <td>[0.04846169, -0.0012961391, 0.016879003, -0.04...</td>\n",
|
|
|
- " <td></td>\n",
|
|
|
- " <td></td>\n",
|
|
|
- " </tr>\n",
|
|
|
- " <tr>\n",
|
|
|
- " <th>6</th>\n",
|
|
|
- " <td>e976a8f6-6731-485f-8a9a-2872a5208818.jpg</td>\n",
|
|
|
- " <td></td>\n",
|
|
|
- " <td></td>\n",
|
|
|
- " <td></td>\n",
|
|
|
- " <td></td>\n",
|
|
|
- " <td>[0.04846169, -0.0012961391, 0.016879003, -0.04...</td>\n",
|
|
|
- " <td></td>\n",
|
|
|
- " <td></td>\n",
|
|
|
- " </tr>\n",
|
|
|
- " <tr>\n",
|
|
|
- " <th>7</th>\n",
|
|
|
- " <td>bbf0d9c7-663d-46d1-a9f8-66e8e5678541.jpg</td>\n",
|
|
|
- " <td></td>\n",
|
|
|
- " <td></td>\n",
|
|
|
- " <td></td>\n",
|
|
|
- " <td></td>\n",
|
|
|
- " <td>[0.04846169, -0.0012961391, 0.016879003, -0.04...</td>\n",
|
|
|
- " <td></td>\n",
|
|
|
- " <td></td>\n",
|
|
|
- " </tr>\n",
|
|
|
- " <tr>\n",
|
|
|
- " <th>8</th>\n",
|
|
|
- " <td>e25a7faa-7a49-4e72-a7ef-e74427f77784.jpg</td>\n",
|
|
|
- " <td></td>\n",
|
|
|
- " <td></td>\n",
|
|
|
- " <td></td>\n",
|
|
|
- " <td></td>\n",
|
|
|
- " <td>[0.04846169, -0.0012961391, 0.016879003, -0.04...</td>\n",
|
|
|
- " <td></td>\n",
|
|
|
- " <td></td>\n",
|
|
|
- " </tr>\n",
|
|
|
- " <tr>\n",
|
|
|
- " <th>9</th>\n",
|
|
|
- " <td>d995ac1f-fbd0-482c-a308-dafb6a93cfd0.jpg</td>\n",
|
|
|
- " <td></td>\n",
|
|
|
- " <td></td>\n",
|
|
|
- " <td></td>\n",
|
|
|
- " <td></td>\n",
|
|
|
- " <td>[0.04846169, -0.0012961391, 0.016879003, -0.04...</td>\n",
|
|
|
- " <td></td>\n",
|
|
|
- " <td></td>\n",
|
|
|
- " </tr>\n",
|
|
|
- " </tbody>\n",
|
|
|
- "</table>\n",
|
|
|
- "</div>"
|
|
|
- ],
|
|
|
- "text/plain": [
|
|
|
- " Filename Title Size Gender Description \\\n",
|
|
|
- "0 d7ed1d64-2c65-427f-9ae4-eb4aaa3e2389.jpg \n",
|
|
|
- "1 5c1b7a77-1fa3-4af8-9722-cd38e45d89da.jpg \n",
|
|
|
- "2 b2e084c7-e3a0-4182-8671-b908544a7cf2.jpg \n",
|
|
|
- "3 87846aa9-86cc-404a-af2c-7e8fe941081d.jpg \n",
|
|
|
- "4 04fa06fb-d71a-4293-9804-fe799375a682.jpg \n",
|
|
|
- "5 8f576f1a-839d-4fb2-a224-a4700b2d05da.jpg \n",
|
|
|
- "6 e976a8f6-6731-485f-8a9a-2872a5208818.jpg \n",
|
|
|
- "7 bbf0d9c7-663d-46d1-a9f8-66e8e5678541.jpg \n",
|
|
|
- "8 e25a7faa-7a49-4e72-a7ef-e74427f77784.jpg \n",
|
|
|
- "9 d995ac1f-fbd0-482c-a308-dafb6a93cfd0.jpg \n",
|
|
|
- "\n",
|
|
|
- " vector Category Type \n",
|
|
|
- "0 [0.04846169, -0.0012961391, 0.016879003, -0.04... \n",
|
|
|
- "1 [0.04846169, -0.0012961391, 0.016879003, -0.04... \n",
|
|
|
- "2 [0.04846169, -0.0012961391, 0.016879003, -0.04... \n",
|
|
|
- "3 [0.04846169, -0.0012961391, 0.016879003, -0.04... \n",
|
|
|
- "4 [0.04846169, -0.0012961391, 0.016879003, -0.04... \n",
|
|
|
- "5 [0.04846169, -0.0012961391, 0.016879003, -0.04... \n",
|
|
|
- "6 [0.04846169, -0.0012961391, 0.016879003, -0.04... \n",
|
|
|
- "7 [0.04846169, -0.0012961391, 0.016879003, -0.04... \n",
|
|
|
- "8 [0.04846169, -0.0012961391, 0.016879003, -0.04... \n",
|
|
|
- "9 [0.04846169, -0.0012961391, 0.016879003, -0.04... "
|
|
|
- ]
|
|
|
- },
|
|
|
- "execution_count": 15,
|
|
|
- "metadata": {},
|
|
|
- "output_type": "execute_result"
|
|
|
- }
|
|
|
- ],
|
|
|
- "source": [
|
|
|
- "tbl.search().to_pandas()"
|
|
|
- ]
|
|
|
- },
|
|
|
- {
|
|
|
"cell_type": "code",
|
|
|
"execution_count": null,
|
|
|
"id": "ee854540-3908-4428-a063-72c8997a2540",
|
|
|
"metadata": {},
|
|
|
"outputs": [],
|
|
|
- "source": []
|
|
|
+ "source": [
|
|
|
+ "#fin"
|
|
|
+ ]
|
|
|
}
|
|
|
],
|
|
|
"metadata": {
|