| 
					
				 | 
			
			
				@@ -43,6 +43,16 @@ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				    ] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				   }, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				   { 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+   "cell_type": "markdown", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+   "id": "7b23d509", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+   "metadata": {}, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+   "source": [ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    "Assuming you have a PDF uploaded on the same machine, please set the path for the file. \n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    "\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    "Also, if you want to flex your GPU-please switch to a bigger model although the featherlight models work perfectly for this task:" 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+   ] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+  }, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+  { 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				    "cell_type": "code", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				    "execution_count": 14, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				    "id": "60d0061b-8b8c-4353-850f-f19466a0ae2d", 
			 | 
		
	
	
		
			
				| 
					
				 | 
			
			
				@@ -60,7 +70,6 @@ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				    "metadata": {}, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				    "outputs": [], 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				    "source": [ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-    "# Import necessary libraries\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "import PyPDF2\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "from typing import Optional\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "import os\n", 
			 | 
		
	
	
		
			
				| 
					
				 | 
			
			
				@@ -75,6 +84,14 @@ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				    ] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				   }, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				   { 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+   "cell_type": "markdown", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+   "id": "203c22eb", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+   "metadata": {}, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+   "source": [ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    "Let's make sure we don't stub our toe by checking if the file exists" 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+   ] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+  }, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+  { 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				    "cell_type": "code", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				    "execution_count": 9, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				    "id": "153d9ece-37a4-4fff-a8e8-53f923a2b0a0", 
			 | 
		
	
	
		
			
				| 
					
				 | 
			
			
				@@ -92,6 +109,16 @@ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				    ] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				   }, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				   { 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+   "cell_type": "markdown", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+   "id": "5a362ac3", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+   "metadata": {}, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+   "source": [ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    "Convert PDF to a `.txt` file. This would simply read and dump the contents of the file. We set the maximum characters to 100k. \n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    "\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    "For people converting their favorite novels into a podcast, they will have to add extra logic of going outside the Llama models context length which is 128k tokens." 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+   ] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+  }, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+  { 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				    "cell_type": "code", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				    "execution_count": 10, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				    "id": "b57c2d64-3d75-4aeb-b4ee-bd1661286b66", 
			 | 
		
	
	
		
			
				| 
					
				 | 
			
			
				@@ -145,6 +172,14 @@ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				    ] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				   }, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				   { 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+   "cell_type": "markdown", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+   "id": "e023397b", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+   "metadata": {}, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+   "source": [ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    "Helper function to grab meta info about our PDF" 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+   ] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+  }, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+  { 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				    "cell_type": "code", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				    "execution_count": 11, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				    "id": "0984bb1e-d52c-4cec-a131-67a48061fabc", 
			 | 
		
	
	
		
			
				| 
					
				 | 
			
			
				@@ -170,6 +205,14 @@ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				    ] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				   }, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				   { 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+   "cell_type": "markdown", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+   "id": "6019affc", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+   "metadata": {}, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+   "source": [ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    "Finally, we can run our logic to extract the details from the file" 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+   ] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+  }, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+  { 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				    "cell_type": "code", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				    "execution_count": 12, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				    "id": "63848943-79cc-4e21-8396-6eab5df493e0", 
			 | 
		
	
	
		
			
				| 
					
				 | 
			
			
				@@ -269,6 +312,22 @@ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				    ] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				   }, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				   { 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+   "cell_type": "markdown", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+   "id": "946d1f59", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+   "metadata": {}, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+   "source": [ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    "### Llama Pre-Processing\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    "\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    "Now let's proceed to justify our distaste for writing regex and use that as a justification for a LLM instead:\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    "\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    "At this point, have a text file extracted from a PDF of a paper. Generally PDF extracts can be messy due to characters, formatting, Latex, Tables, etc. \n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    "\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    "One way to handle this would be using regex, instead we can also prompt the feather light Llama models to clean up our text for us. \n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    "\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    "Please try changing the `SYS_PROMPT` below to see what improvements you can make:" 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+   ] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+  }, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+  { 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				    "cell_type": "code", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				    "execution_count": 60, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				    "id": "7c0828a5-964d-475e-b5f5-40a04e287725", 
			 | 
		
	
	
		
			
				| 
					
				 | 
			
			
				@@ -298,6 +357,16 @@ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				    ] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				   }, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				   { 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+   "cell_type": "markdown", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+   "id": "fd393fae", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+   "metadata": {}, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+   "source": [ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    "Instead of having the model process the entire file at once, as you noticed in the prompt-we will pass chunks of the file. \n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    "\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    "One issue with passing chunks counted by characters is, we lose meaning of words so instead we chunk by words:" 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+   ] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+  }, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+  { 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				    "cell_type": "code", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				    "execution_count": 61, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				    "id": "24e8a547-9d7c-4e2f-be9e-a3aea09cce76", 
			 | 
		
	
	
		
			
				| 
					
				 | 
			
			
				@@ -332,6 +401,14 @@ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				    ] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				   }, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				   { 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+   "cell_type": "markdown", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+   "id": "5d74223f", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+   "metadata": {}, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+   "source": [ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    "Let's load in the model and start processing the text chunks" 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+   ] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+  }, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+  { 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				    "cell_type": "code", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				    "execution_count": 62, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				    "id": "d04a4f07-b0b3-45ca-8f41-a433e1abe050", 
			 | 
		
	
	
		
			
				| 
					
				 | 
			
			
				@@ -2551,6 +2628,14 @@ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				    ] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				   }, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				   { 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+   "cell_type": "markdown", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+   "id": "31cffe8d", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+   "metadata": {}, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+   "source": [ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    "Let's print out the final processed versions to make sure things look good" 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+   ] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+  }, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+  { 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				    "cell_type": "code", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				    "execution_count": 68, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				    "id": "89ef51a7-f13f-49a4-8f73-9ac8ce75319d", 
			 | 
		
	
	
		
			
				| 
					
				 | 
			
			
				@@ -2617,7 +2702,9 @@ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				    "id": "1b16ae0e-04cf-4eb9-a369-dee1728b89ce", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				    "metadata": {}, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				    "outputs": [], 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-   "source": [] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+   "source": [ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    "#fin" 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+   ] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				   } 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  ], 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  "metadata": { 
			 |