# config.yaml
---
  1. # Database Configuration
  2. database:
  3. sql_db_path: "sqlite3.db"
  4. vector_db_path: "chroma.db"
  5. # Model Configuration
  6. model:
  7. backend: openai-compat # [offline-vllm, openai-compat]
  8. # for offline-vllm, use the following
  9. path: "/path/to/checkpoint"
  10. tensor_parallel_size: 4
  11. max_model_len: 32000
  12. max_num_seqs: 32
  13. # for openai-compat, use the following
  14. base_url: "https://api.llama.com/compat/v1"
  15. api_key: ""
  16. model_id: "Llama-4-Maverick-17B-128E-Instruct-FP8"
  17. # Inference Parameters
  18. extraction_inference:
  19. temperature: 0.2
  20. top_p: 0.9
  21. max_completion_tokens: 32000
  22. seed: 42
  23. # Artifact configuration
  24. artifacts:
  25. text:
  26. prompts:
  27. system: "You are an OCR expert. Your task is to extract all the various text sections (main text, body text, titles, captions etc.) from the following document into the provided TARGET SCHEMA. \n\nDo not extract any tables or any text inside pictures and graphs.\n\nEnsure your final answer is appropriately formatted as a JSON object and wrapped in a ```json\n\n``` block."
  28. user: "TARGET SCHEMA:\n```json\n{schema}\n```"
  29. output_schema: {
  30. "type": "object",
  31. "properties": {
  32. "text": {
  33. "type": "array",
  34. "items": {
  35. "type": "object",
  36. "properties": {
  37. "content": {
  38. "type": "string",
  39. "description": "The main text content extracted from the document"
  40. },
  41. "notes": {
  42. "type": "string",
  43. "description": "Any additional notes or observations about the text"
  44. }
  45. },
  46. "required": [
  47. "content",
  48. "notes"
  49. ]
  50. }
  51. }
  52. },
  53. "required": [
  54. "text"
  55. ]
  56. }
  57. use_json_decoding: true
  58. images:
  59. prompts:
  60. system: "You are an OCR expert. (Note: Do not extract tables)\n\n1.Your task is to extract images, pictures, charts and diagrams only from the following document.\n 2. For each extracted image, you must write\n a) a caption as given in the document\n b) a detailed description of the image; utilize the surrounding text for this. Your descriptions should be very informative so that a human can understand what is in the image without ever seeing the document. Think step-by-step and write a JSON that corresponds to the schema and the information in the document\n\nIf there is nothing to extract, simply return an empty JSON {\"images\": []}. \nIf the image is a table, simply return an empty JSON {\"images\": []}. \n\nIf the image is a chart or a graph then you must convert them to JSON outputs.\n\n# Instructions to convert charts & graphs to JSON\nYour task is to: Analyze and describe the chart or graph. Summarize the type of chart/graph (e.g., bar chart, line graph, pie chart). Identify the axes, labels, categories, and any notable trends or patterns. Provide a brief textual description of what the chart/graph represents. Extract and structure the data:\n1. Capture all relevant values and data points from the chart/graph.\n2. Organize the extracted data into a clear and logical JSON structure.\n\n# Output format:\n\nYour response should be captured in the chart_data attribute of the JSON schema. Ensure your final answer is appropriately formatted as a JSON object and wrapped in a ```json\n\n``` block."
  61. user: "TARGET SCHEMA:\n```json\n{schema}\n```"
  62. output_schema: {
  63. "type": "object",
  64. "properties": {
  65. "images": {
  66. "type": "array",
  67. "items": {
  68. "type": "object",
  69. "properties": {
  70. "position_top": {
  71. "type": "string",
  72. "description": "Approximate position from top (e.g., 'top', 'middle', 'bottom')"
  73. },
  74. "position_left": {
  75. "type": "string",
  76. "description": "Approximate position from left (e.g., 'left', 'center', 'right')"
  77. },
  78. "description": {
  79. "type": "string",
  80. "description": "Detailed description of what the image shows"
  81. },
  82. "caption": {
  83. "type": "string",
  84. "description": "Caption or label associated with the image, if any"
  85. },
  86. "image_type": {
  87. "type": "string",
  88. "description": "Type of image (e.g., 'photograph', 'chart', 'diagram', 'illustration')"
  89. },
  90. "chart_data": {
  91. "type": "object",
  92. "properties": {
  93. "type": {
  94. "type": "string",
  95. "enum": ["bar", "line", "pie", "scatter", "area"]
  96. },
  97. "title": {
  98. "type": "string"
  99. },
  100. "subtitle": {
  101. "type": "string"
  102. },
  103. "xAxis": {
  104. "type": "object",
  105. "properties": {
  106. "title": { "type": "string" },
  107. "labels": {
  108. "type": "array",
  109. "items": { "type": "string" }
  110. }
  111. },
  112. "required": ["title", "labels"]
  113. },
  114. "yAxis": {
  115. "type": "object",
  116. "properties": {
  117. "title": { "type": "string" },
  118. "labels": {
  119. "type": "array",
  120. "items": { "type": "string" }
  121. }
  122. },
  123. "required": ["title", "labels"]
  124. },
  125. "data": {
  126. "type": "array",
  127. "items": {
  128. "oneOf": [
  129. {
  130. "type": "object",
  131. "properties": {
  132. "label": { "type": "string" },
  133. "values": {
  134. "type": "array",
  135. "items": { "type": "number" }
  136. }
  137. },
  138. "required": ["label", "values"]
  139. },
  140. {
  141. "type": "object",
  142. "properties": {
  143. "x": { "type": "number" },
  144. "y": { "type": "number" }
  145. },
  146. "required": ["x", "y"]
  147. }
  148. ]
  149. }
  150. },
  151. "options": {
  152. "type": "object",
  153. "properties": {
  154. "legend": { "type": "boolean" },
  155. "rtl": { "type": "boolean" },
  156. "responsive": { "type": "boolean" },
  157. "animation": { "type": "boolean" }
  158. }
  159. }
  160. },
  161. "required": ["type", "title", "xAxis", "yAxis", "data"]
  162. }
  163. },
  164. "required": [
  165. "description",
  166. "caption",
  167. "image_type"
  168. ]
  169. }
  170. }
  171. },
  172. "required": [
  173. "images"
  174. ]
  175. }
  176. use_json_decoding: false
  177. tables:
  178. prompts:
  179. system: "Your task is to extract all tables in this document and prepare it in a structured format. If there are multiple tables in a document, extract each table separately. Your goal is to accurately extract the data so it can be loaded as a pandas dataframe. \n\n1. First you must ensure that there exists some tabular information in the document. If there isn't, simply return an empty JSON \n\n```json\n{\"tables\": []}\n```. 2. Next you must understand the table structure:\n\n * how many columns are there?\n * are any of the rows actually section headers that you should extract as a separate table?\n * how should the numbers be interpreted? (e.g. are they percentages, currency, etc.)\n * are there any special characters or formatting that needs to be preserved?\n\n3. After writing down your observations about the structure and nuances, you must extract the data from the table and format it as a JSON object. Each table or sub-table in the document should be a separate JSON object."
  180. user: "Think step-by-step and write a JSON that corresponds to the schema and the information in the document. Use the following JSON structure for your output:\n\n```json\n{\"tables\": [{\"table_contents\": { < dict, valid json of extracted table> }, \"table_info\": <str, a detailed description of the table and the information it represents>}, {\"table_contents\": { < dict, valid json of extracted table> }, \"table_info\": <str, a detailed description of the table and the information it represents>} ...]\n}\n```\n\nIf there is nothing to extract, simply return an empty JSON \n\n```json\n{\"tables\": []}\n```"
  181. use_json_decoding: false
  182. charts:
  183. prompts:
  184. system: "You are an OCR expert. Your task is to extract charts, graphs and other graphical visualizations of data from the following document into JSON. Do not extract tables, only analyze and extract the charts in this document.\n\n### Steps:\n\n1. Identify the type of chart/graph (e.g., bar chart, line graph, pie chart).\n2. Identify the axes, labels, categories, and any notable trends or patterns.\n3. Provide a brief textual description of what the chart/graph represents; use the caption and surrounding text in the document to write this.\n4. Extract and structure the data into a valid and logical JSON structure.\n5. If there are no charts or graphs to extract, simply return an empty JSON {\"charts\": []}. Ensure your final answer is appropriately formatted as a JSON object and wrapped in a ```json\n\n``` block."
  185. user: "TARGET SCHEMA:\n```json\n{schema}\n```"
  186. output_schema: {
  187. "type": "object",
  188. "properties": {
  189. "charts": {
  190. "type": "array",
  191. "items": {
  192. "type": "object",
  193. "properties": {
  194. "chart_type": {
  195. "type": "string",
  196. "enum": ["bar", "line", "pie", "scatter", "area", "sankey", "heatmap"]
  197. },
  198. "description": {
  199. "type": "string",
  200. "description": "Detailed description of what the image shows"
  201. },
  202. "caption": {
  203. "type": "string",
  204. "description": "Caption or label associated with the image, if any"
  205. },
  206. "data": {
  207. "type": "object",
  208. "description": "Data in the chart; use appropriate keys and values to best represent the information in the chart",
  209. }
  210. },
  211. "required": [
  212. "chart_type",
  213. "description",
  214. "caption",
  215. "data"
  216. ]
  217. }
  218. }
  219. },
  220. "required": [
  221. "charts"
  222. ]
  223. }
  224. use_json_decoding: false