Kevin Slagle 8 месяцев назад
Родитель
Commit
52a85e1564
100 измененных файлов с 12304 добавлено и 67 удалено
  1. 1 1
      scripts/check_copyright_header.py
  2. 0 0
      .github/scripts/markdown_link_check_config.json
  3. 1 1
      scripts/spellcheck.sh
  4. 2 2
      scripts/spellcheck_conf/spellcheck.yaml
  5. 83 1
      scripts/spellcheck_conf/wordlist.txt
  6. 5 5
      .github/workflows/spellcheck.yml
  7. 4 4
      CONTRIBUTING.md
  8. 47 18
      README.md
  9. 6 6
      UPDATES.md
  10. 1 1
      docs/FAQ.md
  11. 4 6
      docs/LLM_finetuning.md
  12. 0 0
      docs/img/feature_based_fn.png
  13. 0 0
      docs/img/feature_based_fn_2.png
  14. 0 0
      docs/img/full_param_fn.png
  15. 0 0
      docs/img/llama2_gradio.png
  16. 0 0
      docs/img/llama2_streamlit.png
  17. 0 0
      docs/img/llama2_streamlit2.png
  18. 0 0
      docs/img/messenger_api_settings.png
  19. 0 0
      docs/img/messenger_llama_arch.jpg
  20. 0 0
      docs/img/wandb_screenshot.png
  21. 0 0
      docs/img/whatsapp_dashboard.jpg
  22. 0 0
      docs/img/whatsapp_llama_arch.jpg
  23. 23 15
      docs/multi_gpu.md
  24. 9 6
      docs/single_gpu.md
  25. 1 1
      pyproject.toml
  26. 8 0
      recipes/3p_integrations/README.md
  27. 0 0
      recipes/3p_integrations/aws/getting_started_llama_3_on_amazon_bedrock.ipynb
  28. 0 0
      recipes/3p_integrations/aws/prompt_engineering_with_llama_2_on_amazon_bedrock.ipynb
  29. 0 0
      recipes/3p_integrations/aws/react_llama_3_bedrock_wk.ipynb
  30. 494 0
      recipes/3p_integrations/azure/Azure MaaS/azure_api_example.ipynb
  31. 2 0
      recipes/3p_integrations/azure/README.md
  32. 1038 0
      recipes/3p_integrations/groq/groq-api-cookbook/function-calling-101-ecommerce/Function-Calling-101-Ecommerce.ipynb
  33. 41 0
      recipes/3p_integrations/groq/groq-api-cookbook/function-calling-101-ecommerce/customers.csv
  34. 21 0
      recipes/3p_integrations/groq/groq-api-cookbook/function-calling-101-ecommerce/orders.csv
  35. 21 0
      recipes/3p_integrations/groq/groq-api-cookbook/function-calling-101-ecommerce/products.csv
  36. 8 0
      recipes/3p_integrations/groq/groq-api-cookbook/json-mode-function-calling-for-sql/data/employees.csv
  37. 6 0
      recipes/3p_integrations/groq/groq-api-cookbook/json-mode-function-calling-for-sql/data/purchases.csv
  38. 677 0
      recipes/3p_integrations/groq/groq-api-cookbook/json-mode-function-calling-for-sql/json-mode-function-calling-for-sql.ipynb
  39. 7 0
      recipes/3p_integrations/groq/groq-api-cookbook/json-mode-function-calling-for-sql/verified-queries/employees-without-purchases.yaml
  40. 9 0
      recipes/3p_integrations/groq/groq-api-cookbook/json-mode-function-calling-for-sql/verified-queries/most-expensive-purchase.yaml
  41. 11 0
      recipes/3p_integrations/groq/groq-api-cookbook/json-mode-function-calling-for-sql/verified-queries/most-recent-purchases.yaml
  42. 6 0
      recipes/3p_integrations/groq/groq-api-cookbook/json-mode-function-calling-for-sql/verified-queries/number-of-teslas.yaml
  43. 639 0
      recipes/3p_integrations/groq/groq-api-cookbook/json-mode-social-determinants-of-health/SDOH-Json-mode.ipynb
  44. 31 0
      recipes/3p_integrations/groq/groq-api-cookbook/json-mode-social-determinants-of-health/clinical_notes/00456321.txt
  45. 28 0
      recipes/3p_integrations/groq/groq-api-cookbook/json-mode-social-determinants-of-health/clinical_notes/00567289.txt
  46. 28 0
      recipes/3p_integrations/groq/groq-api-cookbook/json-mode-social-determinants-of-health/clinical_notes/00678934.txt
  47. 32 0
      recipes/3p_integrations/groq/groq-api-cookbook/json-mode-social-determinants-of-health/clinical_notes/00785642.txt
  48. 30 0
      recipes/3p_integrations/groq/groq-api-cookbook/json-mode-social-determinants-of-health/clinical_notes/00893247.txt
  49. 427 0
      recipes/3p_integrations/groq/groq-api-cookbook/llama3-stock-market-function-calling/llama3-stock-market-function-calling.ipynb
  50. 340 0
      recipes/3p_integrations/groq/groq-api-cookbook/parallel-tool-use/parallel-tool-use.ipynb
  51. 2 0
      recipes/3p_integrations/groq/groq-api-cookbook/parallel-tool-use/requirements.txt
  52. 993 0
      recipes/3p_integrations/groq/groq-api-cookbook/rag-langchain-presidential-speeches/presidential_speeches.csv
  53. 664 0
      recipes/3p_integrations/groq/groq-api-cookbook/rag-langchain-presidential-speeches/rag-langchain-presidential-speeches.ipynb
  54. 21 0
      recipes/3p_integrations/groq/groq-example-templates/conversational-chatbot-langchain/README.md
  55. 74 0
      recipes/3p_integrations/groq/groq-example-templates/conversational-chatbot-langchain/main.py
  56. 0 0
      recipes/3p_integrations/groq/groq-example-templates/conversational-chatbot-langchain/requirements.txt
  57. 23 0
      recipes/3p_integrations/groq/groq-example-templates/crewai-agents/README.md
  58. 184 0
      recipes/3p_integrations/groq/groq-example-templates/crewai-agents/main.py
  59. 3 0
      recipes/3p_integrations/groq/groq-example-templates/crewai-agents/requirements.txt
  60. 21 0
      recipes/3p_integrations/groq/groq-example-templates/groq-quickstart-conversational-chatbot/README.md
  61. 38 0
      recipes/3p_integrations/groq/groq-example-templates/groq-quickstart-conversational-chatbot/main.py
  62. 1 0
      recipes/3p_integrations/groq/groq-example-templates/groq-quickstart-conversational-chatbot/requirements.txt
  63. 27 0
      recipes/3p_integrations/groq/groq-example-templates/groqing-the-stock-market-function-calling-llama3/README.md
  64. 139 0
      recipes/3p_integrations/groq/groq-example-templates/groqing-the-stock-market-function-calling-llama3/main.py
  65. 12 0
      recipes/3p_integrations/groq/groq-example-templates/groqing-the-stock-market-function-calling-llama3/requirements.txt
  66. 21 0
      recipes/3p_integrations/groq/groq-example-templates/llamachat-conversational-chatbot-with-llamaIndex/README.md
  67. 46 0
      recipes/3p_integrations/groq/groq-example-templates/llamachat-conversational-chatbot-with-llamaIndex/main.py
  68. 2 0
      recipes/3p_integrations/groq/groq-example-templates/llamachat-conversational-chatbot-with-llamaIndex/requirements.txt
  69. 33 0
      recipes/3p_integrations/groq/groq-example-templates/presidential-speeches-rag-with-pinecone/README.md
  70. 114 0
      recipes/3p_integrations/groq/groq-example-templates/presidential-speeches-rag-with-pinecone/main.py
  71. 8 0
      recipes/3p_integrations/groq/groq-example-templates/presidential-speeches-rag-with-pinecone/requirements.txt
  72. 57 0
      recipes/3p_integrations/groq/groq-example-templates/text-to-sql-json-mode/README.md
  73. 8 0
      recipes/3p_integrations/groq/groq-example-templates/text-to-sql-json-mode/data/employees.csv
  74. 6 0
      recipes/3p_integrations/groq/groq-example-templates/text-to-sql-json-mode/data/purchases.csv
  75. 145 0
      recipes/3p_integrations/groq/groq-example-templates/text-to-sql-json-mode/main.py
  76. 42 0
      recipes/3p_integrations/groq/groq-example-templates/text-to-sql-json-mode/prompts/base_prompt.txt
  77. 4 0
      recipes/3p_integrations/groq/groq-example-templates/text-to-sql-json-mode/requirements.txt
  78. 53 0
      recipes/3p_integrations/groq/groq-example-templates/verified-sql-function-calling/README.md
  79. 8 0
      recipes/3p_integrations/groq/groq-example-templates/verified-sql-function-calling/data/employees.csv
  80. 6 0
      recipes/3p_integrations/groq/groq-example-templates/verified-sql-function-calling/data/purchases.csv
  81. 158 0
      recipes/3p_integrations/groq/groq-example-templates/verified-sql-function-calling/main.py
  82. 9 0
      recipes/3p_integrations/groq/groq-example-templates/verified-sql-function-calling/requirements.txt
  83. 7 0
      recipes/3p_integrations/groq/groq-example-templates/verified-sql-function-calling/verified-queries/employees-without-purchases.yaml
  84. 9 0
      recipes/3p_integrations/groq/groq-example-templates/verified-sql-function-calling/verified-queries/most-expensive-purchase.yaml
  85. 9 0
      recipes/3p_integrations/groq/groq-example-templates/verified-sql-function-calling/verified-queries/most-recent-purchases.yaml
  86. 6 0
      recipes/3p_integrations/groq/groq-example-templates/verified-sql-function-calling/verified-queries/number-of-teslas.yaml
  87. 1708 0
      recipes/3p_integrations/groq/llama3_cookbook_groq.ipynb
  88. 26 0
      recipes/3p_integrations/lamini/text2sql_memory_tuning/README.md
  89. BIN
      recipes/3p_integrations/lamini/text2sql_memory_tuning/assets/manual_filtering.png
  90. BIN
      recipes/3p_integrations/lamini/text2sql_memory_tuning/assets/website.png
  91. 40 0
      recipes/3p_integrations/lamini/text2sql_memory_tuning/data/gold-test-set-v2.jsonl
  92. 20 0
      recipes/3p_integrations/lamini/text2sql_memory_tuning/data/gold-test-set.jsonl
  93. 220 0
      recipes/3p_integrations/lamini/text2sql_memory_tuning/data/training_data/archive/generated_queries_large_filtered_cleaned.jsonl
  94. 128 0
      recipes/3p_integrations/lamini/text2sql_memory_tuning/data/training_data/archive/generated_queries_v2_large_filtered_cleaned.jsonl
  95. 159 0
      recipes/3p_integrations/lamini/text2sql_memory_tuning/data/training_data/generated_queries.jsonl
  96. 1149 0
      recipes/3p_integrations/lamini/text2sql_memory_tuning/data/training_data/generated_queries_large.jsonl
  97. 330 0
      recipes/3p_integrations/lamini/text2sql_memory_tuning/data/training_data/generated_queries_large_filtered.jsonl
  98. 226 0
      recipes/3p_integrations/lamini/text2sql_memory_tuning/data/training_data/generated_queries_v2.jsonl
  99. 1254 0
      recipes/3p_integrations/lamini/text2sql_memory_tuning/data/training_data/generated_queries_v2_large.jsonl
  100. 0 0
      recipes/3p_integrations/lamini/text2sql_memory_tuning/data/training_data/generated_queries_v2_large_filtered.jsonl

+ 1 - 1
scripts/check_copyright_header.py

@@ -11,7 +11,7 @@ HEADER = """# Copyright (c) Meta Platforms, Inc. and affiliates.
 # This software may be used and distributed according to the terms of the Llama 2 Community License Agreement.\n\n"""
 
 #Files in black list must be relative to main repo folder
-BLACKLIST = ["eval/open_llm_leaderboard/hellaswag_utils.py"]
+BLACKLIST = ["tools/benchmarks/llm_eval_harness/open_llm_leaderboard/hellaswag_utils.py"]
 
 if __name__ == "__main__":
     for ext in ["*.py", "*.sh"]:


+ 1 - 1
scripts/spellcheck.sh

@@ -19,5 +19,5 @@ done
 if [ ! "$sources_arg" ]; then
 	echo "No files to spellcheck"
 else
-	pyspelling -c scripts/spellcheck_conf/spellcheck.yaml --name Markdown $sources_arg
+	pyspelling -c .github/scripts/spellcheck_conf/spellcheck.yaml --name Markdown $sources_arg
 fi

+ 2 - 2
scripts/spellcheck_conf/spellcheck.yaml

@@ -5,8 +5,8 @@ matrix:
     d: en_US
   dictionary:
     wordlists:
-    - scripts/spellcheck_conf/wordlist.txt
-    output: scripts/spellcheck_conf/wordlist.dic
+    - .github/scripts/spellcheck_conf/wordlist.txt
+    output: .github/scripts/spellcheck_conf/wordlist.dic
     encoding: utf-8
   pipeline:
   - pyspelling.filters.context:

+ 83 - 1
scripts/spellcheck_conf/wordlist.txt

@@ -1350,4 +1350,86 @@ SalesBot
 Weaviate
 MediaGen
 SDXL
-SVD
+SVD
+QLORA
+Agentic
+AutoGen
+DeepLearning
+Deeplearning
+Llamaindex
+KV
+KVs
+XSUM
+contrains
+knowlege
+kv
+prefilling
+DataFrame
+DuckDB
+Groq
+GroqCloud
+Replit
+Teslas
+duckdb
+teslas
+Groqs
+groq
+schemas
+Pinecone
+Pinecone's
+Repl
+docsearch
+presidental
+CrewAI
+kickstart
+DataFrames
+Groqing
+Langchain
+Plotly
+dfs
+yfinance
+Groq's
+LlamaChat
+chatbot's
+ConversationBufferWindowMemory
+chatbot's
+Lamini
+lamini
+nba
+sqlite
+customerservice
+fn
+ExecuTorch
+LLMScore
+RecursiveCharacterTextSplitter
+TPD
+TPM
+Tianjun
+Zhang
+distractor
+distractors
+frac
+numRefusal
+totalQA
+DirectoryLoader
+SitemapLoader
+nf
+quant
+DLAI
+agentic
+containts
+dlai
+Prerequirements
+tp
+QLoRA
+ntasks
+srun
+xH
+unquantized
+eom
+ipython
+CPUs
+modelUpgradeExample
+guardrailing
+MaaS
+MFU

+ 5 - 5
.github/workflows/spellcheck.yml

@@ -20,11 +20,11 @@ jobs:
         uses: gaurav-nelson/github-action-markdown-link-check@1.0.13
         with:
           use-verbose-mode: 'yes'
-          config-file: "scripts/markdown_link_check_config.json"
+          config-file: ".github/scripts/markdown_link_check_config.json"
 
       - name: Get changed files
         id: changed-files
-        uses: tj-actions/changed-files@v29.0.4
+        uses: tj-actions/changed-files@v41.0.0
         with:
 
           files: |
@@ -42,7 +42,7 @@ jobs:
 
       - name: Get changed files
         id: changed-files
-        uses: tj-actions/changed-files@v29.0.4
+        uses: tj-actions/changed-files@v41.0.0
         with:
           files: |
             **/*.md
@@ -56,11 +56,11 @@ jobs:
           if [ ! "$sources" ]; then
             echo "No files to spellcheck"
           else
-            pyspelling -c $GITHUB_WORKSPACE/scripts/spellcheck_conf/spellcheck.yaml --name Markdown $sources
+            pyspelling -c $GITHUB_WORKSPACE/.github/scripts/spellcheck_conf/spellcheck.yaml --name Markdown $sources
           fi
 
       - name: In the case of misspellings
         if: ${{ failure() }}
         run: |
           echo "Please fix the misspellings. If you are sure about some of them, "
-          echo "so append those to scripts/spellcheck_conf/wordlist.txt"
+          echo "so append those to .github/scripts/spellcheck_conf/wordlist.txt"

+ 4 - 4
CONTRIBUTING.md

@@ -43,17 +43,17 @@ For development and contributing to llama-recipes please install from source wit
 pip install -U pip setuptools
 pip install --extra-index-url https://download.pytorch.org/whl/test/cu118 -e .[tests,auditnlg,vllm]
 ```
-The unit tests can be found in the [tests](./tests/) folder and you can run them from the main directory using:
+The unit tests can be found in the [src/tests](./src/tests/) folder and you can run them from the main directory using:
 ```
-python -m pytest tests/
+python -m pytest src/tests/
 ```
 To run all tests of a single file you can give the filename directly:
 ```
-python -m pytest tests/test_finetuning.py
+python -m pytest src/tests/test_finetuning.py
 ```
 To run a specific test you can filter for its name with
 ```
-python -m pytest tests/test_finetuning.py -k test_finetuning_peft
+python -m pytest src/tests/test_finetuning.py -k test_finetuning_peft
 ```
 To add a new test simply create a new test file under the `src/tests` folder (filename has to start with `test_`).
 Group tests spanning the same feature in the same file and create a subfolder if the tests are very extensive.

Разница между файлами не показана из-за своего большого размера
+ 47 - 18
README.md


+ 6 - 6
UPDATES.md

@@ -1,19 +1,19 @@
 ## System Prompt Update
 
 ### Observed Issue
-We received feedback from the community on our prompt template and we are providing an update to reduce the false refusal rates seen. False refusals occur when the model incorrectly refuses to answer a question that it should, for example due to overly broad instructions to be cautious in how it provides responses. 
+We received feedback from the community on our prompt template and we are providing an update to reduce the false refusal rates seen. False refusals occur when the model incorrectly refuses to answer a question that it should, for example due to overly broad instructions to be cautious in how it provides responses.
 
 ### Updated approach
-Based on evaluation and analysis, we recommend the removal of the system prompt as the default setting.  Pull request [#626](https://github.com/facebookresearch/llama/pull/626) removes the system prompt as the default option, but still provides an example to help enable experimentation for those using it. 
+Based on evaluation and analysis, we recommend the removal of the system prompt as the default setting.  Pull request [#626](https://github.com/facebookresearch/llama/pull/626) removes the system prompt as the default option, but still provides an example to help enable experimentation for those using it.
 
 ## Token Sanitization Update
 
 ### Observed Issue
-The PyTorch scripts currently provided for tokenization and model inference allow for direct prompt injection via string concatenation. Prompt injections allow for the addition of special system and instruction prompt strings from user-provided prompts. 
+The PyTorch scripts currently provided for tokenization and model inference allow for direct prompt injection via string concatenation. Prompt injections allow for the addition of special system and instruction prompt strings from user-provided prompts.
 
-As noted in the documentation, these strings are required to use the fine-tuned chat models. However, prompt injections have also been used for manipulating or abusing models by bypassing their safeguards, allowing for the creation of content or behaviors otherwise outside the bounds of acceptable use. 
+As noted in the documentation, these strings are required to use the fine-tuned chat models. However, prompt injections have also been used for manipulating or abusing models by bypassing their safeguards, allowing for the creation of content or behaviors otherwise outside the bounds of acceptable use.
 
 ### Updated approach
-We recommend sanitizing [these strings](https://github.com/meta-llama/llama?tab=readme-ov-file#fine-tuned-chat-models) from any user provided prompts. Sanitization of user prompts mitigates malicious or accidental abuse of these strings. The provided scripts have been updated to do this. 
+We recommend sanitizing [these strings](https://github.com/meta-llama/llama?tab=readme-ov-file#fine-tuned-chat-models) from any user provided prompts. Sanitization of user prompts mitigates malicious or accidental abuse of these strings. The provided scripts have been updated to do this.
 
-Note: even with this update safety classifiers should still be applied to catch unsafe behaviors or content produced by the model. An [example](./recipes/inference/local_inference/inference.py) of how to deploy such a classifier can be found in the llama-recipes repository.
+Note: even with this update safety classifiers should still be applied to catch unsafe behaviors or content produced by the model. An [example](./recipes/quickstart/inference/local_inference/inference.py) of how to deploy such a classifier can be found in the llama-recipes repository.

+ 1 - 1
docs/FAQ.md

@@ -16,7 +16,7 @@ Here we discuss frequently asked questions that may occur and we found useful al
 
 4. Can I add custom datasets?
 
-    Yes, you can find more information on how to do that [here](../recipes/finetuning/datasets/README.md).
+    Yes, you can find more information on how to do that [here](../recipes/quickstart/finetuning/datasets/README.md).
 
 5. What are the hardware SKU requirements for deploying these models?
 

+ 4 - 6
docs/LLM_finetuning.md

@@ -1,6 +1,6 @@
 ## LLM Fine-Tuning
 
-Here we discuss fine-tuning Meta Llama 3 with a couple of different recipes. We will cover two scenarios here:
+Here we discuss fine-tuning Meta Llama with a couple of different recipes. We will cover two scenarios here:
 
 
 ## 1. **Parameter Efficient Model Fine-Tuning**
@@ -18,8 +18,6 @@ These methods will address three aspects:
 
 HF [PEFT](https://github.com/huggingface/peft) library provides an easy way of using these methods which we make use of here. Please read more [here](https://huggingface.co/blog/peft).
 
-
-
 ## 2. **Full/ Partial Parameter Fine-Tuning**
 
 Full parameter fine-tuning has its own advantages, in this method there are multiple strategies that can help:
@@ -35,9 +33,9 @@ Full parameter fine-tuning has its own advantages, in this method there are mult
 You can also keep most of the layers frozen and only fine-tune a few layers. There are many different techniques to choose from to freeze/unfreeze layers based on different criteria.
 
 <div style="display: flex;">
-    <img src="./images/feature-based_FN.png" alt="Image 1" width="250" />
-    <img src="./images/feature-based_FN_2.png" alt="Image 2" width="250" />
-    <img src="./images/full-param-FN.png" alt="Image 3" width="250" />
+    <img src="./img/feature_based_fn.png" alt="Image 1" width="250" />
+    <img src="./img/feature_based_fn_2.png" alt="Image 2" width="250" />
+    <img src="./img/full_param_fn.png" alt="Image 3" width="250" />
 </div>
 
 

docs/images/feature-based_FN.png → docs/img/feature_based_fn.png


docs/images/feature-based_FN_2.png → docs/img/feature_based_fn_2.png


docs/images/full-param-FN.png → docs/img/full_param_fn.png


docs/images/llama2-gradio.png → docs/img/llama2_gradio.png


docs/images/llama2-streamlit.png → docs/img/llama2_streamlit.png


docs/images/llama2-streamlit2.png → docs/img/llama2_streamlit2.png


docs/images/messenger_api_settings.png → docs/img/messenger_api_settings.png


docs/images/messenger_llama_arch.jpg → docs/img/messenger_llama_arch.jpg


docs/images/wandb_screenshot.png → docs/img/wandb_screenshot.png


docs/images/whatsapp_dashboard.jpg → docs/img/whatsapp_dashboard.jpg


docs/images/whatsapp_llama_arch.jpg → docs/img/whatsapp_llama_arch.jpg


+ 23 - 15
docs/multi_gpu.md

@@ -6,12 +6,11 @@ To run fine-tuning on multi-GPUs, we will  make use of two packages:
 
 2. [FSDP](https://pytorch.org/tutorials/intermediate/FSDP_adavnced_tutorial.html) which helps us parallelize the training over multiple GPUs. [More details](LLM_finetuning.md/#2-full-partial-parameter-finetuning).
 
-Given the combination of PEFT and FSDP, we would be able to fine tune a Meta Llama 3 8B model on multiple GPUs in one node or multi-node.
+Given the combination of PEFT and FSDP, we would be able to fine-tune a Meta Llama 8B model on multiple GPUs in one node.
+For big models like 405B, we will need to fine-tune in a multi-node setup even if 4-bit quantization is enabled.
 
 ## Requirements
-To run the examples, make sure to install the llama-recipes package and clone the github repository in order to use the provided [`finetuning.py`](../recipes/finetuning/finetuning.py) script with torchrun (See [README.md](../README.md) for details).
-
-**Please note that the llama_recipes package will install PyTorch 2.0.1 version, in case you want to run FSDP + PEFT, please make sure to install PyTorch nightlies.**
+To run the examples, make sure to install the llama-recipes package and clone the github repository in order to use the provided [`finetuning.py`](../recipes/quickstart/finetuning/finetuning.py) script with torchrun (See [README.md](../README.md) for details).
 
 ## How to run it
 
@@ -24,7 +23,7 @@ This runs with the `samsum_dataset` for summarization application by default.
 
 ```bash
 
-torchrun --nnodes 1 --nproc_per_node 4  examples/finetuning.py --enable_fsdp --model_name /path_of_model_folder/8B --use_peft --peft_method lora --output_dir Path/to/save/PEFT/model
+torchrun --nnodes 1 --nproc_per_node 4  recipes/quickstart/finetuning/finetuning.py --enable_fsdp --model_name /path_of_model_folder/8B --use_peft --peft_method lora --output_dir Path/to/save/PEFT/model
 
 ```
 
@@ -34,7 +33,7 @@ The args used in the command above are:
 
 * `--use_peft` boolean flag to enable PEFT methods in the script
 
-* `--peft_method` to specify the PEFT method, here we use `lora` other options are `llama_adapter`, `prefix`.
+* `--peft_method` to specify the PEFT method; here we use `lora`, the other option is `llama_adapter`.
 
 We use `torchrun` here to spawn multiple processes for FSDP.
 
@@ -43,7 +42,7 @@ We use `torchrun` here to spawn multiple processes for FSDP.
 Setting `use_fast_kernels` will enable the use of Flash Attention or Xformer memory-efficient kernels based on the hardware being used. This would speed up the fine-tuning job. This has been enabled in `optimum` library from HuggingFace as a one-liner API, please read more [here](https://pytorch.org/blog/out-of-the-box-acceleration/).
 
 ```bash
-torchrun --nnodes 1 --nproc_per_node 4  examples/finetuning.py --enable_fsdp --model_name /path_of_model_folder/8B --use_peft --peft_method lora --output_dir Path/to/save/PEFT/model --use_fast_kernels
+torchrun --nnodes 1 --nproc_per_node 4  recipes/quickstart/finetuning/finetuning.py --enable_fsdp --model_name /path_of_model_folder/8B --use_peft --peft_method lora --output_dir Path/to/save/PEFT/model --use_fast_kernels
 ```
 
 ### Fine-tuning using FSDP Only
@@ -52,8 +51,16 @@ If interested in running full parameter finetuning without making use of PEFT me
 
 ```bash
 
-torchrun --nnodes 1 --nproc_per_node 8  examples/finetuning.py --enable_fsdp --model_name /path_of_model_folder/8B --dist_checkpoint_root_folder model_checkpoints --dist_checkpoint_folder fine-tuned --pure_bf16 --use_fast_kernels
+torchrun --nnodes 1 --nproc_per_node 8  recipes/quickstart/finetuning/finetuning.py --enable_fsdp --model_name /path_of_model_folder/8B --dist_checkpoint_root_folder model_checkpoints --dist_checkpoint_folder fine-tuned --fsdp_config.pure_bf16 --use_fast_kernels
+
+```
+
+### Fine-tuning using FSDP + QLORA
+
+This has been tested on 4 H100 GPUs.
 
+```bash
+ FSDP_CPU_RAM_EFFICIENT_LOADING=1 ACCELERATE_USE_FSDP=1 torchrun --nnodes 1 --nproc_per_node 4  recipes/quickstart/finetuning/finetuning.py --enable_fsdp  --quantization 4bit --model_name /path_of_model_folder/70B  --mixed_precision False --low_cpu_fsdp --use_peft --peft_method lora --output_dir Path/to/save/PEFT/model
 ```
 
 ### Fine-tuning using FSDP on 70B Model
@@ -62,7 +69,7 @@ If you are interested in running full parameter fine-tuning on the 70B model, yo
 
 ```bash
 
-torchrun --nnodes 1 --nproc_per_node 8 examples/finetuning.py --enable_fsdp --low_cpu_fsdp --pure_bf16 --model_name /path_of_model_folder/70B --batch_size_training 1 --dist_checkpoint_root_folder model_checkpoints --dist_checkpoint_folder fine-tuned
+torchrun --nnodes 1 --nproc_per_node 8 recipes/quickstart/finetuning/finetuning.py --enable_fsdp --low_cpu_fsdp --fsdp_config.pure_bf16 --model_name /path_of_model_folder/70B --batch_size_training 1 --dist_checkpoint_root_folder model_checkpoints --dist_checkpoint_folder fine-tuned
 
 ```
 
@@ -72,7 +79,7 @@ Here we use a slurm script to schedule a job with slurm over multiple nodes.
 
 ```bash
 
-sbatch examples/multi_node.slurm
+sbatch recipes/quickstart/finetuning/multi_node.slurm
 # Change the num nodes and GPU per nodes in the script before running.
 
 ```
@@ -95,16 +102,16 @@ To run with each of the datasets set the `dataset` flag in the command as shown
 
 ```bash
 # grammar_dataset
-torchrun --nnodes 1 --nproc_per_node 4  examples/finetuning.py --enable_fsdp  --model_name /path_of_model_folder/8B --use_peft --peft_method lora --dataset grammar_dataset --save_model --dist_checkpoint_root_folder model_checkpoints --dist_checkpoint_folder fine-tuned  --pure_bf16 --output_dir Path/to/save/PEFT/model
+torchrun --nnodes 1 --nproc_per_node 4  recipes/quickstart/finetuning/finetuning.py --enable_fsdp  --model_name /path_of_model_folder/8B --use_peft --peft_method lora --dataset grammar_dataset --save_model --dist_checkpoint_root_folder model_checkpoints --dist_checkpoint_folder fine-tuned  --fsdp_config.pure_bf16 --output_dir Path/to/save/PEFT/model
 
 # alpaca_dataset
 
-torchrun --nnodes 1 --nproc_per_node 4  examples/finetuning.py --enable_fsdp  --model_name /path_of_model_folder/8B --use_peft --peft_method lora --dataset alpaca_dataset --save_model --dist_checkpoint_root_folder model_checkpoints --dist_checkpoint_folder fine-tuned --pure_bf16 --output_dir Path/to/save/PEFT/model
+torchrun --nnodes 1 --nproc_per_node 4  recipes/quickstart/finetuning/finetuning.py --enable_fsdp  --model_name /path_of_model_folder/8B --use_peft --peft_method lora --dataset alpaca_dataset --save_model --dist_checkpoint_root_folder model_checkpoints --dist_checkpoint_folder fine-tuned --fsdp_config.pure_bf16 --output_dir Path/to/save/PEFT/model
 
 
 # samsum_dataset
 
-torchrun --nnodes 1 --nproc_per_node 4  examples/finetuning.py --enable_fsdp --model_name /path_of_model_folder/8B --use_peft --peft_method lora --dataset samsum_dataset --save_model --dist_checkpoint_root_folder model_checkpoints --dist_checkpoint_folder fine-tuned --pure_bf16 --output_dir Path/to/save/PEFT/model
+torchrun --nnodes 1 --nproc_per_node 4  recipes/quickstart/finetuning/finetuning.py --enable_fsdp --model_name /path_of_model_folder/8B --use_peft --peft_method lora --dataset samsum_dataset --save_model --dist_checkpoint_root_folder model_checkpoints --dist_checkpoint_folder fine-tuned --fsdp_config.pure_bf16 --output_dir Path/to/save/PEFT/model
 
 ```
 
@@ -138,8 +145,9 @@ It lets us specify the training settings for everything from `model_name` to `da
     mixed_precision: bool=True
     val_batch_size: int=1
     dataset = "samsum_dataset"
-    peft_method: str = "lora" # None,llama_adapter, prefix
+    peft_method: str = "lora" # None, llama_adapter (Caution: llama_adapter is currently not supported with FSDP)
     use_peft: bool=False
+    from_peft_checkpoint: str="" # if not empty and use_peft=True, will load the peft checkpoint and resume the fine-tuning on that checkpoint
     output_dir: str = "PATH/to/save/PEFT/model"
     freeze_layers: bool = False
     num_freeze_layers: int = 1
@@ -181,7 +189,7 @@ It lets us specify the training settings for everything from `model_name` to `da
 
 * `fsdp_activation_checkpointing` enables activation checkpointing for FSDP; this saves a significant amount of memory with the trade-off of recomputing intermediate activations during the backward pass. The saved memory can be re-invested in higher batch sizes to increase the throughput. We recommend you use this option.
 
-* `pure_bf16` it moves the  model to `BFloat16` and if `optimizer` is set to `anyprecision` then optimizer states will be kept in `BFloat16` as well. You can use this option if necessary.
+* `fsdp_config.pure_bf16` it moves the  model to `BFloat16` and if `optimizer` is set to `anyprecision` then optimizer states will be kept in `BFloat16` as well. You can use this option if necessary.
 
 ## FLOPS Counting and Pytorch Profiling
 

+ 9 - 6
docs/single_gpu.md

@@ -17,17 +17,18 @@ To run the examples, make sure to install the llama-recipes package (See [README
 
 Get access to a machine with one GPU or if using a multi-GPU machine please make sure to only make one of them visible using `export CUDA_VISIBLE_DEVICES=GPU:id` and run the following. It runs by default with `samsum_dataset` for summarization application.
 
+**NOTE** To run the fine-tuning with `QLORA`, make sure to set `--peft_method lora` and `--quantization 4bit`.
 
 ```bash
 
-python -m llama_recipes.finetuning  --use_peft --peft_method lora --quantization --use_fp16 --model_name /path_of_model_folder/8B --output_dir Path/to/save/PEFT/model
+python -m llama_recipes.finetuning  --use_peft --peft_method lora --quantization 8bit --use_fp16 --model_name /path_of_model_folder/8B --output_dir Path/to/save/PEFT/model
 
 ```
 The args used in the command above are:
 
 * `--use_peft` boolean flag to enable PEFT methods in the script
 
-* `--peft_method` to specify the PEFT method, here we use `lora` other options are `llama_adapter`, `prefix`.
+* `--peft_method` to specify the PEFT method; here we use `lora`, the other option is `llama_adapter`.
 
 * `--quantization` selects the quantization mode; `8bit` (used here) enables int8 quantization
 
@@ -51,16 +52,16 @@ to run with each of the datasets set the `dataset` flag in the command as shown
 ```bash
 # grammar_dataset
 
-python -m llama_recipes.finetuning  --use_peft --peft_method lora --quantization  --dataset grammar_dataset --model_name /path_of_model_folder/8B --output_dir Path/to/save/PEFT/model
+python -m llama_recipes.finetuning  --use_peft --peft_method lora --quantization 8bit --dataset grammar_dataset --model_name /path_of_model_folder/8B --output_dir Path/to/save/PEFT/model
 
 # alpaca_dataset
 
-python -m llama_recipes.finetuning  --use_peft --peft_method lora --quantization  --dataset alpaca_dataset --model_name /path_of_model_folder/8B --output_dir Path/to/save/PEFT/model
+python -m llama_recipes.finetuning  --use_peft --peft_method lora --quantization 8bit --dataset alpaca_dataset --model_name /path_of_model_folder/8B --output_dir Path/to/save/PEFT/model
 
 
 # samsum_dataset
 
-python -m llama_recipes.finetuning  --use_peft --peft_method lora --quantization  --dataset samsum_dataset --model_name /path_of_model_folder/8B --output_dir Path/to/save/PEFT/model
+python -m llama_recipes.finetuning  --use_peft --peft_method lora --quantization 8bit --dataset samsum_dataset --model_name /path_of_model_folder/8B --output_dir Path/to/save/PEFT/model
 
 ```
 
@@ -94,8 +95,9 @@ It let us specify the training settings, everything from `model_name` to `datase
     mixed_precision: bool=True
     val_batch_size: int=1
     dataset = "samsum_dataset"
-    peft_method: str = "lora" # None,llama_adapter, prefix
+    peft_method: str = "lora" # None, llama_adapter (Caution: llama_adapter is currently not supported with FSDP)
     use_peft: bool=False
+    from_peft_checkpoint: str="" # if not empty and use_peft=True, will load the peft checkpoint and resume the fine-tuning on that checkpoint
     output_dir: str = "PATH/to/save/PEFT/model"
     freeze_layers: bool = False
     num_freeze_layers: int = 1
@@ -112,6 +114,7 @@ It let us specify the training settings, everything from `model_name` to `datase
     flop_counter_start: int = 3 # The step to start profiling, default is 3, which means after 3 steps of warmup stage, the profiler will start to count flops.
     use_profiler: bool = False # Enable pytorch profiler, can not be used with flop counter at the same time.
     profiler_dir: str = "PATH/to/save/profiler/results" # will be used if using profiler
+
 ```
 
 * [Datasets config file](../src/llama_recipes/configs/datasets.py) provides the available options for datasets.

+ 1 - 1
pyproject.toml

@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 
 [project]
 name = "llama-recipes"
-version = "0.0.1"
+version = "0.0.3"
 authors = [
   { name="Hamid Shojanazeri", email="hamidnazeri@meta.com" },
   { name="Matthias Reso", email="mreso@meta.com" },

+ 8 - 0
recipes/3p_integrations/README.md

@@ -0,0 +1,8 @@
+## Llama-Recipes 3P Integrations
+
+This folder contains example scripts showcasing the use of Meta Llama with popular platforms and tooling in the LLM ecosystem. 
+
+Each folder is maintained by the platform-owner. 
+
+> [!NOTE]
+> If you'd like to add your platform here, please open a new issue with details of your examples.

recipes/llama_api_providers/examples_with_aws/getting_started_llama_3_on_amazon_bedrock.ipynb → recipes/3p_integrations/aws/getting_started_llama_3_on_amazon_bedrock.ipynb


recipes/llama_api_providers/examples_with_aws/Prompt_Engineering_with_Llama_2_On_Amazon_Bedrock.ipynb → recipes/3p_integrations/aws/prompt_engineering_with_llama_2_on_amazon_bedrock.ipynb


recipes/llama_api_providers/examples_with_aws/ReAct_Llama_3_Bedrock-WK.ipynb → recipes/3p_integrations/aws/react_llama_3_bedrock_wk.ipynb


+ 494 - 0
recipes/3p_integrations/azure/Azure MaaS/azure_api_example.ipynb

@@ -0,0 +1,494 @@
+{
+  "cells": [
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "# Use Azure API with Llama 3.1\n",
+        "\n",
+        "This notebook shows examples of how to use Llama 3.1 APIs offered by Microsoft Azure. We will cover:  \n",
+        "* HTTP requests API usage for Llama 3.1 instruct models in CLI\n",
+        "* HTTP requests API usage for Llama 3.1 instruct models in Python\n",
+        "* Plug the APIs into LangChain\n",
+        "* Wire the model with Gradio to build a simple chatbot with memory\n",
+        "\n",
+        "\n"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "## Prerequisite\n",
+        "\n",
+        "Before we start building with Azure Llama 3.1 APIs, there are certain steps we need to take to deploy the models:\n",
+        "\n",
+        "* Register for a valid Azure account with subscription [here](https://azure.microsoft.com/en-us/free/search/?ef_id=_k_CjwKCAiA-P-rBhBEEiwAQEXhH5OHAJLhzzcNsuxwpa5c9EJFcuAjeh6EvZw4afirjbWXXWkiZXmU2hoC5GoQAvD_BwE_k_&OCID=AIDcmm5edswduu_SEM__k_CjwKCAiA-P-rBhBEEiwAQEXhH5OHAJLhzzcNsuxwpa5c9EJFcuAjeh6EvZw4afirjbWXXWkiZXmU2hoC5GoQAvD_BwE_k_&gad_source=1&gclid=CjwKCAiA-P-rBhBEEiwAQEXhH5OHAJLhzzcNsuxwpa5c9EJFcuAjeh6EvZw4afirjbWXXWkiZXmU2hoC5GoQAvD_BwE)\n",
+        "* Take a quick look at what [Azure AI Studio](https://learn.microsoft.com/en-us/azure/ai-studio/what-is-ai-studio?tabs=home) is and navigate to the website from the link in the article\n",
+        "* Follow the demos in the article to create a project and [resource](https://learn.microsoft.com/en-us/azure/azure-resource-manager/management/manage-resource-groups-portal) group.\n",
+        "* For Llama 3.1 instruct models from Model catalog, click Deploy in the model page and select \"Serverless API with Azure AI Content Safety\". Once deployed successfully, you should be assigned an API endpoint and a security key for inference.\n",
+        "* For Llama 3.1 pretrained models, Azure currently only supports manual deployment under a regular subscription. This means you will need to acquire a virtual machine with managed compute resources. We won't cover it in this tutorial.\n",
+        "\n",
+        "For more information, you should consult Azure's official documentation [here](https://learn.microsoft.com/en-us/azure/ai-studio/how-to/deploy-models-llama?tabs=azure-studio) for model deployment and inference."
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "## HTTP Requests API Usage in CLI\n",
+        "\n",
+        "### Basics\n",
+        "\n",
+        "The usage and schema of the API are identical to Llama 3 API hosted on Azure.\n",
+        "\n",
+        "To use the REST API, you will need an endpoint URL and the authentication key associated with that endpoint.  \n",
+        "This can be acquired from previous steps.  \n",
+        "\n",
+        "In this chat completion example for instruct model, we use a simple curl call for illustration. There are three major components:  \n",
+        "\n",
+        "* The `host-url` is your endpoint url with completion schema. \n",
+        "* The `headers` defines the content type as well as your api key. \n",
+        "* The `payload` or `data`, which is your prompt detail and model hyper parameters."
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "The `host-url` needs to be `/v1/chat/completions` and the request payload to include roles in conversations. Here is a sample payload:  \n",
+        "\n",
+        "```\n",
+        "{ \n",
+        "  \"messages\": [ \n",
+        "    { \n",
+        "      \"content\": \"You are a helpful assistant.\", \n",
+        "      \"role\": \"system\" \n",
+        "    }, \n",
+        "    { \n",
+        "      \"content\": \"Hello!\", \n",
+        "      \"role\": \"user\" \n",
+        "    } \n",
+        "  ], \n",
+        "  \"max_tokens\": 50 \n",
+        "} \n",
+        "```\n",
+        "\n",
+        "Here is a sample curl call for chat completion"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "!curl -X POST -L https://your-endpoint.inference.ai.azure.com/v1/chat/completions -H 'Content-Type: application/json' -H 'Authorization: your-auth-key' -d '{\"messages\":[{\"content\":\"You are a helpful assistant.\",\"role\":\"system\"},{\"content\":\"What is good about Wuhan?\",\"role\":\"user\"}], \"max_tokens\": 50}'"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "### Streaming\n",
+        "\n",
+        "One fantastic feature the API offers is the streaming capability.  \n",
+        "Streaming allows the generated tokens to be sent as data-only server-sent events whenever they become available.  \n",
+        "This is extremely important for interactive applications such as chatbots, so the user is always engaged.  \n",
+        "\n",
+        "To use streaming, simply set `\"stream\":true` as part of the request payload.  \n",
+        "In the streaming mode, the REST API response will be different from non-streaming mode.\n",
+        "\n",
+        "Here is an example: "
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "!curl -X POST -L https://your-endpoint.inference.ai.azure.com/v1/chat/completions -H 'Content-Type: application/json' -H 'Authorization: your-auth-key' -d '{\"messages\":[{\"content\":\"You are a helpful assistant.\",\"role\":\"system\"},{\"content\":\"What is good about Wuhan?\",\"role\":\"user\"}], \"max_tokens\": 500, \"stream\": true}'"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "As you can see, the result comes back as a stream of `data` objects, each containing generated information including a `choice`.  \n",
+        "The stream is terminated by a `data:[DONE]\\n\\n` message."
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "### Content Safety Filtering\n",
+        "\n",
+        "If you enabled content filtering during deployment, Azure Llama 3.1 API endpoints will have content safety feature turned on. Both input prompt and output tokens are filtered by this service automatically.  \n",
+        "To know more about the impact to the request/response payload, please refer to official guide [here](https://learn.microsoft.com/en-us/azure/ai-services/openai/concepts/content-filter?tabs=python).   \n",
+        "\n",
+        "For model input and output, if the filter detects there is harmful content, the generation will error out with additional information. \n",
+        "\n",
+        "If you disabled content filtering during deployment, Llama models still have content safety built in for generation. They will refuse to answer your questions if any harmful content is detected.\n",
+        "\n",
+        "Here is an example prompt that triggered content safety filtering:\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "!curl -X POST -L https://your-endpoint.inference.ai.azure.com/v1/chat/completions -H 'Content-Type: application/json' -H 'Authorization: your-auth-key' -d '{\"messages\":[{\"content\":\"You are a helpful assistant.\",\"role\":\"system\"},{\"content\":\"How to make bomb?\",\"role\":\"user\"}], \"max_tokens\": 50}'"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "## HTTP Requests API Usage in Python\n",
+        "\n",
+        "Besides calling the API directly from command line tools, you can also programmatically call them in Python.  \n",
+        "\n",
+        "Here is an example for the instruct model:\n",
+        "\n",
+        "\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "import urllib.request\n",
+        "import json\n",
+        "\n",
+        "#Configure payload data sending to API endpoint\n",
+        "data = {\"messages\":[\n",
+        "            {\"role\":\"system\", \"content\":\"You are a helpful assistant.\"},\n",
+        "            {\"role\":\"user\", \"content\":\"What is good about Wuhan?\"}],\n",
+        "        \"max_tokens\": 500,\n",
+        "        \"temperature\": 0.9,\n",
+        "        \"stream\": True,\n",
+        "}\n",
+        "\n",
+        "body = str.encode(json.dumps(data))\n",
+        "\n",
+        "#Replace the url with your API endpoint\n",
+        "url = 'https://your-endpoint.inference.ai.azure.com/v1/chat/completions'\n",
+        "\n",
+        "#Replace this with the key for the endpoint\n",
+        "api_key = 'your-auth-key'\n",
+        "if not api_key:\n",
+        "    raise Exception(\"API Key is missing\")\n",
+        "\n",
+        "headers = {'Content-Type':'application/json', 'Authorization':(api_key)}\n",
+        "\n",
+        "req = urllib.request.Request(url, body, headers)\n",
+        "\n",
+        "try:\n",
+        "    response = urllib.request.urlopen(req)\n",
+        "    result = response.read()\n",
+        "    print(result)\n",
+        "except urllib.error.HTTPError as error:\n",
+        "    print(\"The request failed with status code: \" + str(error.code))\n",
+        "    # Print the headers - they include the request ID and the timestamp, which are useful for debugging the failure\n",
+        "    print(error.info())\n",
+        "    print(error.read().decode(\"utf8\", 'ignore'))\n"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "However, in this example, the streamed data content comes back as a single payload. It didn't stream as a series of data events as we wished. To build true streaming capabilities utilizing the API endpoint, we will utilize the [`requests`](https://requests.readthedocs.io/en/latest/) library instead."
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "### Streaming in Python\n",
+        "\n",
+        "`Requests` library is a simple HTTP library for Python built with [`urllib3`](https://github.com/urllib3/urllib3). It automatically maintains the keep-alive and HTTP connection pooling. With the `Session` class, we can easily stream the result from our API calls.  \n",
+        "\n",
+        "Here is a quick example:"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "import json\n",
+        "import requests\n",
+        "\n",
+        "data = {\"messages\":[\n",
+        "            {\"role\":\"system\", \"content\":\"You are a helpful assistant.\"},\n",
+        "            {\"role\":\"user\", \"content\":\"What is good about Wuhan?\"}],\n",
+        "        \"max_tokens\": 500,\n",
+        "        \"temperature\": 0.9,\n",
+        "        \"stream\": True\n",
+        "}\n",
+        "\n",
+        "\n",
+        "def post_stream(url):\n",
+        "    s = requests.Session()\n",
+        "    api_key = \"your-auth-key\"\n",
+        "    headers = {'Content-Type':'application/json', 'Authorization':(api_key)}\n",
+        "\n",
+        "    with s.post(url, data=json.dumps(data), headers=headers, stream=True) as resp:\n",
+        "        print(resp.status_code)\n",
+        "        for line in resp.iter_lines():\n",
+        "            if line:\n",
+        "                print(line)\n",
+        "\n",
+        "\n",
+        "url = \"https://your-endpoint.inference.ai.azure.com/v1/chat/completions\"\n",
+        "post_stream(url)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "## Use Llama 3.1 API with LangChain\n",
+        "\n",
+        "In this section, we will demonstrate how to use Llama 3.1 APIs with LangChain, one of the most popular frameworks to accelerate building your AI product.  \n",
+        "One common solution here is to create your customized LLM instance, so you can add it to various chains to complete different tasks.  \n",
+        "In this example, we will use the `AzureMLChatOnlineEndpoint` class LangChain provides to build a customized LLM instance. This particular class is designed to take in an Azure endpoint and API key as inputs and wire them up with HTTP calls. Under the hood, it is very similar to how we used the `urllib.request` library to send RESTful calls to the Azure endpoint in previous examples.   \n",
+        "\n",
+        "First, let's install dependencies: \n",
+        "\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "pip install langchain"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Once all dependencies are installed, you can directly create a `llm` instance based on `AzureMLChatOnlineEndpoint` as follows:  "
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "from langchain_community.chat_models.azureml_endpoint import (\n",
+        "    AzureMLEndpointApiType,\n",
+        "    CustomOpenAIChatContentFormatter,\n",
+        "    AzureMLChatOnlineEndpoint,\n",
+        ")\n",
+        "\n",
+        "from langchain_core.messages import HumanMessage\n",
+        "\n",
+        "llm = AzureMLChatOnlineEndpoint(\n",
+        "    endpoint_api_key=\"your-auth-key\",\n",
+        "    endpoint_url=\"https://your-endpoint.inference.ai.azure.com/v1/chat/completions\",\n",
+        "    endpoint_api_type=AzureMLEndpointApiType.serverless,\n",
+        "    model_kwargs={\"temperature\": 0.6, \"max_tokens\": 256, \"top_p\": 0.9},\n",
+        "    content_formatter=CustomOpenAIChatContentFormatter(),\n",
+        ")"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "You might wonder what the `CustomOpenAIChatContentFormatter` is for when creating the `llm` instance.   \n",
+        "The `CustomOpenAIChatContentFormatter` is a [handler class](https://python.langchain.com/docs/integrations/llms/azure_ml#content-formatter) for transforming the request and response of an AzureML endpoint to match the required schema. Because there are various models in the Azure model catalog, each of them needs its data handled accordingly.  \n",
+        "In our case, we can use the default `CustomOpenAIChatContentFormatter` which can handle Llama model schemas. If you need special handling, you can customize this specific class. \n",
+        "\n",
+        "Once you have the `llm` ready, you can simply run inference with it:"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "response = llm.invoke([HumanMessage(content=\"What is good about Wuhan?\")])\n",
+        "response"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Here is an example of how you can create a translator chain with the `llm` instance and translate English to French:"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "from langchain.chains import LLMChain\n",
+        "from langchain.prompts import PromptTemplate\n",
+        "\n",
+        "template = \"\"\"\n",
+        "You are a Translator. Translate the following content from {input_language} to {output_language} and reply with only the translated result.\n",
+        "{input_content}\n",
+        "\"\"\"\n",
+        "\n",
+        "translator_chain = LLMChain(\n",
+        "    llm = llm,\n",
+        "    prompt = PromptTemplate(\n",
+        "            template=template,\n",
+        "            input_variables=[\"input_language\", \"output_language\", \"input_content\"],\n",
+        "        ),\n",
+        ")\n",
+        "\n",
+        "print(translator_chain.run(input_language=\"English\", output_language=\"French\", input_content=\"What is good about Wuhan?\"))\n"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "## Build a chatbot with Llama 3.1 API\n",
+        "\n",
+        "In this section, we will build a simple chatbot using Azure Llama 3.1 API, LangChain and [Gradio](https://www.gradio.app/)'s `ChatInterface` with memory capability.\n",
+        "\n",
+        "Gradio is a framework to help demo your machine learning model with a web interface. We also have a dedicated Gradio chatbot [example](https://github.com/meta-llama/llama-recipes/blob/main/recipes/use_cases/customerservice_chatbots/RAG_chatbot/RAG_Chatbot_Example.ipynb) built with Llama 3 on-premises with RAG.   \n",
+        "\n",
+        "First, let's install Gradio dependencies.\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "pip install gradio==4.39.0"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Let's use `AzureMLChatOnlineEndpoint` class from the previous example.  \n",
+        "In this example, we have three major components:  \n",
+        "1. Chatbot UI hosted as web interface by Gradio. These are the UI logics that render our model predictions.\n",
+        "2. Model itself, which is the core component that ingests prompts and returns an answer back.\n",
+        "3. Memory component, which stores previous conversation context. In this example, we will use a [conversation window buffer](https://python.langchain.com/docs/modules/memory/types/buffer_window), which keeps the context of the last few interactions (`k=5` in the code below). \n",
+        "\n",
+        "All of them are chained together using LangChain."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "import gradio as gr\n",
+        "import langchain\n",
+        "from langchain.chains import ConversationChain\n",
+        "from langchain.prompts import PromptTemplate\n",
+        "from langchain.memory import ConversationBufferWindowMemory\n",
+        "from langchain_core.messages import HumanMessage\n",
+        "from langchain_community.chat_models.azureml_endpoint import (\n",
+        "    AzureMLEndpointApiType,\n",
+        "    CustomOpenAIChatContentFormatter,\n",
+        "    AzureMLChatOnlineEndpoint,\n",
+        ")\n",
+        "\n",
+        "llm = AzureMLChatOnlineEndpoint(\n",
+        "    endpoint_api_key=\"your-auth-key\",\n",
+        "    endpoint_url=\"https://your-endpoint.inference.ai.azure.com/v1/chat/completions\",\n",
+        "    endpoint_api_type=AzureMLEndpointApiType.serverless,\n",
+        "    model_kwargs={\"temperature\": 0.6, \"max_tokens\": 256, \"top_p\": 0.9},\n",
+        "    content_formatter=CustomOpenAIChatContentFormatter(),\n",
+        ")\n",
+        "\n",
+        "langchain.debug=True\n",
+        "\n",
+        "#Create memory\n",
+        "memory = ConversationBufferWindowMemory(llm=llm, k=5, memory_key=\"chat_history\", ai_prefix=\"Assistant\", human_prefix=\"User\")\n",
+        "\n",
+        "#Create input prompt template with chat history for chaining\n",
+        "INPUT_TEMPLATE = \"\"\"Current conversation:\n",
+        "{chat_history}\n",
+        "\n",
+        "User question:{input}\"\"\"\n",
+        "\n",
+        "conversation_prompt_template = PromptTemplate(\n",
+        "    input_variables=[\"chat_history\", \"input\"], template=INPUT_TEMPLATE\n",
+        ")\n",
+        "\n",
+        "conversation_chain_with_memory = ConversationChain(\n",
+        "    llm = llm,\n",
+        "    prompt = conversation_prompt_template,\n",
+        "    verbose = True,\n",
+        "    memory = memory,\n",
+        ")\n",
+        "\n",
+        "#Prediction\n",
+        "def predict(message, history):\n",
+        "    history_format = []\n",
+        "    for user, assistant in history:\n",
+        "        history_format.append({\"role\": \"user\", \"content\": user })\n",
+        "        history_format.append({\"role\": \"assistant\", \"content\":assistant})\n",
+        "    history_format.append({\"role\": \"user\", \"content\": message})\n",
+        "    response = conversation_chain_with_memory.run(input=message)\n",
+        "    return response\n",
+        "\n",
+        "#Launch Gradio chatbot interface\n",
+        "gr.ChatInterface(predict).launch()"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "After successfully executing the code above, a chat interface should appear as the interactive output or you can open the localhost URL in your browser window. You can see how amazing it is to build an AI chatbot in just a few lines of code.\n",
+        "\n",
+        "This concludes our tutorial and examples. Here are some additional references:  \n",
+        "* [Fine-tune Llama](https://learn.microsoft.com/azure/ai-studio/how-to/fine-tune-model-llama)\n",
+        "* [Plan and manage costs (marketplace)](https://learn.microsoft.com/azure/ai-studio/how-to/costs-plan-manage#monitor-costs-for-models-offered-through-the-azure-marketplace)\n"
+      ]
+    }
+  ],
+  "metadata": {
+    "fileHeader": "",
+    "fileUid": "599e1edd-cd59-4e55-823f-17157fc07b18",
+    "isAdHoc": false,
+    "kernelspec": {
+      "display_name": "Python 3",
+      "language": "python",
+      "name": "python3"
+    },
+    "language_info": {
+      "codemirror_mode": {
+        "name": "ipython",
+        "version": 3
+      },
+      "file_extension": ".py",
+      "mimetype": "text/x-python",
+      "name": "python",
+      "nbconvert_exporter": "python",
+      "pygments_lexer": "ipython3",
+      "version": "3.9.6"
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 2
+}

+ 2 - 0
recipes/3p_integrations/azure/README.md

@@ -0,0 +1,2 @@
+In this folder, we show various recipes for Llama models working with Azure AI services. This includes:
+* Examples for running Llama model inference on Azure's serverless API offerings (aka. MaaS)

Разница между файлами не показана из-за своего большого размера
+ 1038 - 0
recipes/3p_integrations/groq/groq-api-cookbook/function-calling-101-ecommerce/Function-Calling-101-Ecommerce.ipynb


+ 41 - 0
recipes/3p_integrations/groq/groq-api-cookbook/function-calling-101-ecommerce/customers.csv

@@ -0,0 +1,41 @@
+customer_id,name,email,address
+1,Erin Boyle MD,erin.boyle.md@example.com,"165 Brown Springs
+Michaelport, IL 60228"
+2,Matthew Saunders,matthew.saunders@example.com,"219 Steven Mountains
+Port Gabriellafort, OH 52281"
+3,Amanda Anderson,amanda.anderson@example.com,"498 Laurie Glens
+Mitchelltown, CT 93655"
+4,Julian Butler,julian.butler@example.com,"909 Rodriguez Harbors Suite 119
+New Tracyburgh, MS 15487"
+5,Zachary Mitchell MD,zachary.mitchell.md@example.com,"9087 Matthew Drives
+Caitlynshire, OR 42442"
+6,Troy Bennett,troy.bennett@example.com,"73329 Kimberly Loaf Apt. 029
+Shellyborough, TX 55939"
+7,Allison Hall,allison.hall@example.com,"210 Shannon Camp
+New Michael, MO 65990"
+8,Carolyn Davis,carolyn.davis@example.com,"64228 Carol Courts Suite 087
+New Micheleshire, MT 42516"
+9,Cindy Munoz,cindy.munoz@example.com,"1722 Christine Plaza
+Danielport, UT 12261"
+10,Tom Testuser,tom.testuser@example.com,"451 Victoria Bridge Suite 529
+Pageton, WI 27404"
+11,Charles Walker,charles.walker@example.com,"2077 Lamb Drive
+Salazarton, IN 54619"
+12,Brianna Molina,brianna.molina@example.com,"586 Khan Mills Suite 202
+Lake Dominique, VA 98527"
+13,Austin Andrade,austin.andrade@example.com,"4857 Donna Cliffs
+Floydstad, PR 82540"
+14,Brandon Andrade,brandon.andrade@example.com,"906 Olivia Motorway
+Kelleyfort, AK 48960"
+15,Diane Lam,diane.lam@example.com,"070 Eric Rapid Suite 159
+Townsendbury, MI 57664"
+16,Jason Kelly,jason.kelly@example.com,"873 Angela Track Apt. 972
+Stephenville, NV 32705"
+17,Mr. Mitchell Saunders,mr..mitchell.saunders@example.com,"USS White
+FPO AE 91058"
+18,Regina Ross,regina.ross@example.com,"91857 Wendy Place
+East Charlesshire, CA 43705"
+19,Mrs. Denise May DDS,mrs..denise.may.dds@example.com,"64590 Kathleen Cove Apt. 736
+Derrickton, AK 05935"
+20,Lisa Boyle,lisa.boyle@example.com,"USNS Russell
+FPO AE 51528"

+ 21 - 0
recipes/3p_integrations/groq/groq-api-cookbook/function-calling-101-ecommerce/orders.csv

@@ -0,0 +1,21 @@
+order_id,product_id,customer_id,order_date
+1,13,18,2024-02-15 15:15
+2,19,6,2024-01-03 17:43
+3,12,20,2024-03-11 1:13
+4,7,20,2024-02-04 12:04
+5,14,3,2024-05-02 17:12
+6,17,6,2024-02-12 1:46
+7,20,4,2024-02-26 2:59
+8,4,7,2024-05-02 16:51
+9,11,2,2024-01-04 11:09
+10,6,9,2024-04-09 15:04
+11,3,7,2024-02-21 21:17
+12,6,18,2024-02-21 18:50
+13,17,11,2024-05-02 16:20
+14,11,15,2024-04-20 2:49
+15,16,7,2024-01-18 1:12
+16,16,16,2024-05-03 11:20
+17,14,18,2024-03-26 22:51
+18,20,16,2024-05-07 23:25
+19,1,12,2024-05-20 12:41
+20,20,3,2024-01-17 7:25

+ 21 - 0
recipes/3p_integrations/groq/groq-api-cookbook/function-calling-101-ecommerce/products.csv

@@ -0,0 +1,21 @@
+product_id,name,description,price,stock_quantity
+1,Laptop,High performance laptop with 16GB RAM and 512GB SSD.,753.03,15
+2,Smartphone,Latest model smartphone with a stunning display and great camera.,398.54,59
+3,Headphones,Noise-cancelling over-ear headphones with long battery life.,889.79,97
+4,Monitor,24-inch 1080p monitor with vibrant colors and wide viewing angles.,604.44,98
+5,Keyboard,Mechanical keyboard with customizable RGB lighting.,500.24,52
+6,Mouse,Wireless mouse with ergonomic design and long battery life.,321.98,57
+7,Printer,All-in-one printer with wireless connectivity and high-quality printing.,695.29,32
+8,Tablet,Portable tablet with 10-inch display and powerful processor.,625.75,28
+9,Smartwatch,Stylish smartwatch with fitness tracking and notifications.,952.72,42
+10,Camera,Digital camera with 20MP sensor and 4K video recording.,247.93,99
+11,Speaker,Bluetooth speaker with excellent sound quality and deep bass.,896.4,32
+12,Router,Wi-Fi router with high speed and wide coverage.,976.16,59
+13,External Hard Drive,1TB external hard drive with fast data transfer speeds.,434.46,18
+14,USB Flash Drive,64GB USB flash drive with compact design and reliable storage.,991.09,77
+15,Microphone,Professional microphone with clear sound and adjustable settings.,276.23,30
+16,Webcam,HD webcam with wide-angle lens and built-in microphone.,890.39,13
+17,Drone,Compact drone with HD camera and stable flight controls.,285.93,37
+18,Projector,Portable projector with bright display and multiple connectivity options.,290.22,31
+19,Fitness Tracker,Fitness tracker with heart rate monitor and sleep tracking.,953.65,4
+20,E-Reader,Lightweight e-reader with high-resolution display and long battery life.,132.15,62

+ 8 - 0
recipes/3p_integrations/groq/groq-api-cookbook/json-mode-function-calling-for-sql/data/employees.csv

@@ -0,0 +1,8 @@
+employee_id,name,email
+1,Richard Hendricks,richard@piedpiper.com
+2,Erlich Bachman,erlich@aviato.com
+3,Dinesh Chugtai,dinesh@piedpiper.com
+4,Bertram Gilfoyle,gilfoyle@piedpiper.com
+5,Jared Dunn,jared@piedpiper.com
+6,Monica Hall,monica@raviga.com
+7,Gavin Belson,gavin@hooli.com

+ 6 - 0
recipes/3p_integrations/groq/groq-api-cookbook/json-mode-function-calling-for-sql/data/purchases.csv

@@ -0,0 +1,6 @@
+purchase_id,purchase_date,product_name,employee_id,amount
+1,'2024-02-01',iPhone,1,750
+2,'2024-02-02',Tesla,2,70000
+3,'2024-02-03',Humane pin,3,500
+4,'2024-02-04',iPhone,4,700
+5,'2024-02-05',Tesla,5,75000

Разница между файлами не показана из-за своего большого размера
+ 677 - 0
recipes/3p_integrations/groq/groq-api-cookbook/json-mode-function-calling-for-sql/json-mode-function-calling-for-sql.ipynb


+ 7 - 0
recipes/3p_integrations/groq/groq-api-cookbook/json-mode-function-calling-for-sql/verified-queries/employees-without-purchases.yaml

@@ -0,0 +1,7 @@
+# Anti-join: an employee is returned only when no purchase row matches in the
+# LEFT JOIN. The date condition lives in ON (not WHERE) so that employees with
+# only older purchases still survive the join with NULL purchase columns.
+# "Since Feb 1, 2024" is inclusive, so purchases dated 2024-02-01 count.
+description: Employees without a purchase since Feb 1, 2024
+sql: |
+  SELECT employees.name as employees_without_purchases
+  FROM employees.csv AS employees
+  LEFT JOIN purchases.csv AS purchases ON employees.employee_id = purchases.employee_id
+  AND purchases.purchase_date >= '2024-02-01'
+  WHERE purchases.purchase_id IS NULL

+ 9 - 0
recipes/3p_integrations/groq/groq-api-cookbook/json-mode-function-calling-for-sql/verified-queries/most-expensive-purchase.yaml

@@ -0,0 +1,9 @@
+# Joins purchases to employees on employee_id, takes each employee's largest
+# purchase amount (grouped by employee name), and returns the single row with
+# the highest such amount.
+description: Employee with the most expensive purchase
+sql: |
+  SELECT employees.name AS employee_name,
+        MAX(amount) AS max_purchase_amount
+  FROM purchases.csv AS purchases
+  JOIN employees.csv AS employees ON purchases.employee_id = employees.employee_id
+  GROUP BY employees.name
+  ORDER BY max_purchase_amount DESC
+  LIMIT 1

+ 11 - 0
recipes/3p_integrations/groq/groq-api-cookbook/json-mode-function-calling-for-sql/verified-queries/most-recent-purchases.yaml

@@ -0,0 +1,11 @@
+# Lists purchase date, product, amount and purchaser name for the five
+# purchases with the latest purchase_date (newest first). Purchases are
+# inner-joined to employees on employee_id.
+description: Five most recent purchases
+sql: |
+  SELECT 
+         purchases.purchase_date,
+         purchases.product_name,
+         purchases.amount,
+         employees.name
+  FROM purchases.csv AS purchases
+  JOIN employees.csv AS employees ON purchases.employee_id = employees.employee_id
+  ORDER BY purchases.purchase_date DESC
+  LIMIT 5;

+ 6 - 0
recipes/3p_integrations/groq/groq-api-cookbook/json-mode-function-calling-for-sql/verified-queries/number-of-teslas.yaml

@@ -0,0 +1,6 @@
+# Counts purchase rows whose product_name is exactly 'Tesla'.
+# NOTE(review): the inner join to employees means a purchase is counted only
+# if its employee_id matches an employee row - confirm that is intended.
+description: Number of Teslas purchased
+sql: |
+  SELECT COUNT(*) as number_of_teslas
+  FROM purchases.csv AS p
+  JOIN employees.csv AS e ON e.employee_id = p.employee_id
+  WHERE p.product_name = 'Tesla'

Разница между файлами не показана из-за своего большого размера
+ 639 - 0
recipes/3p_integrations/groq/groq-api-cookbook/json-mode-social-determinants-of-health/SDOH-Json-mode.ipynb


Разница между файлами не показана из-за своего большого размера
+ 31 - 0
recipes/3p_integrations/groq/groq-api-cookbook/json-mode-social-determinants-of-health/clinical_notes/00456321.txt


Разница между файлами не показана из-за своего большого размера
+ 28 - 0
recipes/3p_integrations/groq/groq-api-cookbook/json-mode-social-determinants-of-health/clinical_notes/00567289.txt


Разница между файлами не показана из-за своего большого размера
+ 28 - 0
recipes/3p_integrations/groq/groq-api-cookbook/json-mode-social-determinants-of-health/clinical_notes/00678934.txt


+ 32 - 0
recipes/3p_integrations/groq/groq-api-cookbook/json-mode-social-determinants-of-health/clinical_notes/00785642.txt

@@ -0,0 +1,32 @@
+**Date:** March 28, 2024
+
+**Patient:** Brian Lee, 55 years old
+
+**MRN:** 00785642
+
+**Chief Complaint:** "I've been having trouble managing my blood sugar levels."
+
+**History of Present Illness:** The patient is a 55-year-old with a known diagnosis of Type 2 Diabetes Mellitus, presenting with difficulty in managing blood sugar levels over the past month. Reports fluctuating blood sugar readings despite adherence to prescribed diet and medication. The patient expresses a desire to avoid any complications associated with poor diabetes management.
+
+**Past Medical History:** Type 2 Diabetes Mellitus, controlled hypertension
+
+**Social History:**
+The patient is a self-employed graphic designer, working from a home office. They describe their work as fulfilling and report a stable income. They own a home in a well-regarded neighborhood, noting its quiet and safe environment. The patient has a supportive spouse and a close circle of friends, often participating in social gatherings and community events.
+
+The patient completed a bachelor's degree in graphic design and continues to take online courses to stay updated in their field. They have reliable transportation, a recent model car, ensuring timely access to healthcare appointments. The patient is an active member of a local walking group, which meets thrice a week for exercise and socialization.
+
+Nutritionally, the patient is mindful of their diet, focusing on low-glycemic foods, and has not faced issues with food security. They have comprehensive health insurance coverage through a private provider, with satisfactory benefits that cover their medical needs, including diabetes management.
+
+**Review of Systems:** Reports consistent adherence to diabetic diet and medication regimen. Denies any episodes of hypoglycemia or diabetic ketoacidosis.
+
+**Physical Examination:**
+- General: Well-nourished and well-kept appearance. Alert and oriented.
+- Vitals: BP 130/80, HR 72, Temp 98.6°F, Resp 14/min
+
+**Assessment/Plan:**
+- Review current diabetes management plan and consider medication adjustments.
+- Recommend continuous glucose monitoring (CGM) to better understand glucose patterns and variability.
+- Encourage continued engagement with community exercise groups and dietary mindfulness.
+- Schedule a follow-up appointment in 3 months or sooner if glucose management issues persist.
+
+**Comments:** The patient demonstrates a proactive approach to managing their diabetes, supported by a stable and healthy social environment. Continued focus on lifestyle modification and close monitoring of blood sugar levels are key to preventing complications.

Разница между файлами не показана из-за своего большого размера
+ 30 - 0
recipes/3p_integrations/groq/groq-api-cookbook/json-mode-social-determinants-of-health/clinical_notes/00893247.txt


Разница между файлами не показана из-за своего большого размера
+ 427 - 0
recipes/3p_integrations/groq/groq-api-cookbook/llama3-stock-market-function-calling/llama3-stock-market-function-calling.ipynb


+ 340 - 0
recipes/3p_integrations/groq/groq-api-cookbook/parallel-tool-use/parallel-tool-use.ipynb

@@ -0,0 +1,340 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "104f2b97-f9bb-4dcc-a4c8-099710768851",
+   "metadata": {},
+   "source": [
+    "# Parallel Tool Use"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "f8dc57b6-2c48-4ee3-bb2c-25441274ed2f",
+   "metadata": {},
+   "source": [
+    "### Setup"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "e70814b4",
+   "metadata": {},
+   "source": [
+    "Make sure you have `ipykernel` and `pip` pre-installed"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "962ae5e2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%pip install -r requirements.txt"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "e21816b3",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'Groq API key configured: gsk_7FdrzM...'"
+      ]
+     },
+     "execution_count": 1,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "import os\n",
+    "import json\n",
+    "\n",
+    "from groq import Groq\n",
+    "from dotenv import load_dotenv\n",
+    "\n",
+    "load_dotenv()\n",
+    "\"Groq API key configured: \" + os.environ[\"GROQ_API_KEY\"][:10] + \"...\""
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "7f7c9c55-e925-4cc1-89f2-58237acf14a4",
+   "metadata": {},
+   "source": [
+    "We will use the ```llama3-70b-8192``` model in this demo. Note that you will need a Groq API Key to proceed and can create an account [here](https://console.groq.com/) to generate one for free. Only Llama 3 models support parallel tool use at this time (05/07/2024).\n",
+    "\n",
+    "We recommend using the 70B Llama 3 model; the 8B model has subpar consistency."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "0cca781b-1950-4167-b36a-c1099d6b3b00",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "client = Groq(api_key=os.getenv(\"GROQ_API_KEY\"))\n",
+    "model = \"llama3-70b-8192\""
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "2c23ec2b",
+   "metadata": {},
+   "source": [
+    "Let's define a dummy function we can invoke in our tool use loop"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "f2ce18dc",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def get_weather(city: str):\n",
+    "    if city == \"Madrid\":\n",
+    "        return 35\n",
+    "    elif city == \"San Francisco\":\n",
+    "        return 18\n",
+    "    elif city == \"Paris\":\n",
+    "        return 20\n",
+    "    else:\n",
+    "        return 15"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "a37e3c92",
+   "metadata": {},
+   "source": [
+    "Now we define our messages and tools and run the completion request."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "6b454910-4352-40cc-b9b2-cc79edabd7c1",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "messages = [\n",
+    "    {\"role\": \"system\", \"content\": \"\"\"You are a helpful assistant.\"\"\"},\n",
+    "    {\n",
+    "        \"role\": \"user\",\n",
+    "        \"content\": \"What is the weather in Paris, Tokyo and Madrid?\",\n",
+    "    },\n",
+    "]\n",
+    "tools = [\n",
+    "    {\n",
+    "        \"type\": \"function\",\n",
+    "        \"function\": {\n",
+    "            \"name\": \"get_weather\",\n",
+    "            \"description\": \"Returns the weather in the given city in degrees Celsius\",\n",
+    "            \"parameters\": {\n",
+    "                \"type\": \"object\",\n",
+    "                \"properties\": {\n",
+    "                    \"city\": {\n",
+    "                        \"type\": \"string\",\n",
+    "                        \"description\": \"The name of the city\",\n",
+    "                    }\n",
+    "                },\n",
+    "                \"required\": [\"city\"],\n",
+    "            },\n",
+    "        },\n",
+    "    }\n",
+    "]\n",
+    "response = client.chat.completions.create(\n",
+    "    model=model, messages=messages, tools=tools, tool_choice=\"auto\", max_tokens=4096\n",
+    ")\n",
+    "\n",
+    "response_message = response.choices[0].message"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "25c2838f",
+   "metadata": {},
+   "source": [
+    "# Processing the tool calls\n",
+    "\n",
+    "Now we process the assistant message and construct the required messages to continue the conversation. \n",
+    "\n",
+    "*Including* invoking each tool_call against our actual function."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "fe623ab9",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[\n",
+      "  {\n",
+      "    \"role\": \"system\",\n",
+      "    \"content\": \"You are a helpful assistant.\"\n",
+      "  },\n",
+      "  {\n",
+      "    \"role\": \"user\",\n",
+      "    \"content\": \"What is the weather in Paris, Tokyo and Madrid?\"\n",
+      "  },\n",
+      "  {\n",
+      "    \"role\": \"assistant\",\n",
+      "    \"tool_calls\": [\n",
+      "      {\n",
+      "        \"id\": \"call_5ak8\",\n",
+      "        \"function\": {\n",
+      "          \"name\": \"get_weather\",\n",
+      "          \"arguments\": \"{\\\"city\\\":\\\"Paris\\\"}\"\n",
+      "        },\n",
+      "        \"type\": \"function\"\n",
+      "      },\n",
+      "      {\n",
+      "        \"id\": \"call_zq26\",\n",
+      "        \"function\": {\n",
+      "          \"name\": \"get_weather\",\n",
+      "          \"arguments\": \"{\\\"city\\\":\\\"Tokyo\\\"}\"\n",
+      "        },\n",
+      "        \"type\": \"function\"\n",
+      "      },\n",
+      "      {\n",
+      "        \"id\": \"call_znf3\",\n",
+      "        \"function\": {\n",
+      "          \"name\": \"get_weather\",\n",
+      "          \"arguments\": \"{\\\"city\\\":\\\"Madrid\\\"}\"\n",
+      "        },\n",
+      "        \"type\": \"function\"\n",
+      "      }\n",
+      "    ]\n",
+      "  },\n",
+      "  {\n",
+      "    \"role\": \"tool\",\n",
+      "    \"content\": \"20\",\n",
+      "    \"tool_call_id\": \"call_5ak8\"\n",
+      "  },\n",
+      "  {\n",
+      "    \"role\": \"tool\",\n",
+      "    \"content\": \"15\",\n",
+      "    \"tool_call_id\": \"call_zq26\"\n",
+      "  },\n",
+      "  {\n",
+      "    \"role\": \"tool\",\n",
+      "    \"content\": \"35\",\n",
+      "    \"tool_call_id\": \"call_znf3\"\n",
+      "  }\n",
+      "]\n"
+     ]
+    }
+   ],
+   "source": [
+    "tool_calls = response_message.tool_calls\n",
+    "\n",
+    "messages.append(\n",
+    "    {\n",
+    "        \"role\": \"assistant\",\n",
+    "        \"tool_calls\": [\n",
+    "            {\n",
+    "                \"id\": tool_call.id,\n",
+    "                \"function\": {\n",
+    "                    \"name\": tool_call.function.name,\n",
+    "                    \"arguments\": tool_call.function.arguments,\n",
+    "                },\n",
+    "                \"type\": tool_call.type,\n",
+    "            }\n",
+    "            for tool_call in tool_calls\n",
+    "        ],\n",
+    "    }\n",
+    ")\n",
+    "\n",
+    "available_functions = {\n",
+    "    \"get_weather\": get_weather,\n",
+    "}\n",
+    "for tool_call in tool_calls:\n",
+    "    function_name = tool_call.function.name\n",
+    "    function_to_call = available_functions[function_name]\n",
+    "    function_args = json.loads(tool_call.function.arguments)\n",
+    "    function_response = function_to_call(**function_args)\n",
+    "\n",
+    "    # Note how we create a separate tool call message for each tool call\n",
+    "    # the model is able to discern the tool call result through the tool_call_id\n",
+    "    messages.append(\n",
+    "        {\n",
+    "            \"role\": \"tool\",\n",
+    "            \"content\": json.dumps(function_response),\n",
+    "            \"tool_call_id\": tool_call.id,\n",
+    "        }\n",
+    "    )\n",
+    "\n",
+    "print(json.dumps(messages, indent=2))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "1abe981a",
+   "metadata": {},
+   "source": [
+    "Now we run our final completion with multiple tool call results included in the messages array.\n",
+    "\n",
+    "**Note**\n",
+    "\n",
+    "We pass the tool definitions again to help the model:\n",
+    "\n",
+    "1. Understand the assistant message with the tool calls\n",
+    "2. Interpret the tool results."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "5f077df3",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "The weather in Paris is 20°C, in Tokyo is 15°C, and in Madrid is 35°C.\n"
+     ]
+    }
+   ],
+   "source": [
+    "response = client.chat.completions.create(\n",
+    "    model=model, messages=messages, tools=tools, tool_choice=\"auto\", max_tokens=4096\n",
+    ")\n",
+    "\n",
+    "print(response.choices[0].message.content)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.13"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}

+ 2 - 0
recipes/3p_integrations/groq/groq-api-cookbook/parallel-tool-use/requirements.txt

@@ -0,0 +1,2 @@
+groq
+python-dotenv

Разница между файлами не показана из-за своего большого размера
+ 993 - 0
recipes/3p_integrations/groq/groq-api-cookbook/rag-langchain-presidential-speeches/presidential_speeches.csv


Разница между файлами не показана из-за своего большого размера
+ 664 - 0
recipes/3p_integrations/groq/groq-api-cookbook/rag-langchain-presidential-speeches/rag-langchain-presidential-speeches.ipynb


+ 21 - 0
recipes/3p_integrations/groq/groq-example-templates/conversational-chatbot-langchain/README.md

@@ -0,0 +1,21 @@
+# Groq LangChain Conversational Chatbot
+
+A simple application that allows users to interact with a conversational chatbot powered by LangChain. The application uses the Groq API to generate responses and leverages LangChain's [ConversationBufferWindowMemory](https://python.langchain.com/v0.1/docs/modules/memory/types/buffer_window/) to maintain a history of the conversation to provide context for the chatbot's responses.
+
+## Features
+
+- **Conversational Interface**: The application provides a conversational interface where users can ask questions or make statements, and the chatbot responds accordingly.
+
+- **Contextual Responses**: The application maintains a history of the conversation, which is used to provide context for the chatbot's responses.
+
+- **LangChain Integration**: The chatbot is built with LangChain, which assembles the prompt, manages the conversation memory, and sends each request to the Groq-hosted Llama 3 model that generates the responses.
+
+## Usage
+
+<!-- markdown-link-check-disable -->
+
+You will need to store a valid Groq API Key as a secret to proceed with this example. You can generate one for free [here](https://console.groq.com/keys).
+
+<!-- markdown-link-check-enable -->
+
+You can [fork and run this application on Replit](https://replit.com/@GroqCloud/Chatbot-with-Conversational-Memory-on-LangChain) or run it on the command line with `python main.py`.

+ 74 - 0
recipes/3p_integrations/groq/groq-example-templates/conversational-chatbot-langchain/main.py

@@ -0,0 +1,74 @@
+import os
+from groq import Groq
+
+from langchain.chains import ConversationChain, LLMChain
+from langchain_core.prompts import (
+    ChatPromptTemplate,
+    HumanMessagePromptTemplate,
+    MessagesPlaceholder,
+)
+from langchain_core.messages import SystemMessage
+from langchain.chains.conversation.memory import ConversationBufferWindowMemory
+from langchain_groq import ChatGroq
+from langchain.prompts import PromptTemplate
+
+
+def main():
+    """
+    This function is the main entry point of the application. It sets up the Groq client, the Streamlit interface, and handles the chat interaction.
+    """
+
+    # Get Groq API key
+    groq_api_key = os.environ['GROQ_API_KEY']
+    model = 'llama3-8b-8192'
+    # Initialize Groq Langchain chat object and conversation
+    groq_chat = ChatGroq(
+            groq_api_key=groq_api_key, 
+            model_name=model
+    )
+    
+    print("Hello! I'm your friendly Groq chatbot. I can help answer your questions, provide information, or just chat. I'm also super fast! Let's start our conversation!")
+
+    system_prompt = 'You are a friendly conversational chatbot'
+    conversational_memory_length = 5 # number of previous messages the chatbot will remember during the conversation
+
+    memory = ConversationBufferWindowMemory(k=conversational_memory_length, memory_key="chat_history", return_messages=True)
+
+
+    #chat_history = []
+    while True:
+        user_question = input("Ask a question: ")
+
+        # If the user has asked a question,
+        if user_question:
+
+            # Construct a chat prompt template using various components
+            prompt = ChatPromptTemplate.from_messages(
+                [
+                    SystemMessage(
+                        content=system_prompt
+                    ),  # This is the persistent system prompt that is always included at the start of the chat.
+
+                    MessagesPlaceholder(
+                        variable_name="chat_history"
+                    ),  # This placeholder will be replaced by the actual chat history during the conversation. It helps in maintaining context.
+
+                    HumanMessagePromptTemplate.from_template(
+                        "{human_input}"
+                    ),  # This template is where the user's current input will be injected into the prompt.
+                ]
+            )
+
+            # Create a conversation chain using the LangChain LLM (Language Learning Model)
+            conversation = LLMChain(
+                llm=groq_chat,  # The Groq LangChain chat object initialized earlier.
+                prompt=prompt,  # The constructed prompt template.
+                verbose=False,   # TRUE Enables verbose output, which can be useful for debugging.
+                memory=memory,  # The conversational memory object that stores and manages the conversation history.
+            )
+            # The chatbot's answer is generated by sending the full prompt to the Groq API.
+            response = conversation.predict(human_input=user_question)
+            print("Chatbot:", response)
+
+if __name__ == "__main__":
+    main()

+ 0 - 0
recipes/3p_integrations/groq/groq-example-templates/conversational-chatbot-langchain/requirements.txt


+ 23 - 0
recipes/3p_integrations/groq/groq-example-templates/crewai-agents/README.md

@@ -0,0 +1,23 @@
+# CrewAI Machine Learning Assistant
+
+## Overview
+
+The [CrewAI](https://docs.crewai.com/) Machine Learning Assistant is a command line application designed to kickstart your machine learning projects. It leverages a team of AI agents to guide you through the initial steps of defining, assessing, and solving machine learning problems.
+
+## Features
+
+- **Agents**: Utilizes specialized agents to perform tasks such as problem definition, data assessment, model recommendation, and code generation, enhancing the workflow and efficiency of machine learning projects.
+
+- **CrewAI Framework**: Integrates multiple agents into a cohesive framework, enabling seamless interaction and task execution to streamline the machine learning process.
+
+- **LangChain Integration**: Incorporates LangChain to facilitate natural language processing and enhance the interaction between the user and the machine learning assistant.
+
+## Usage
+
+<!-- markdown-link-check-disable -->
+
+You will need to store a valid Groq API Key as a secret to proceed with this example. You can generate one for free [here](https://console.groq.com/keys).
+
+<!-- markdown-link-check-enable -->
+
+You can [fork and run this application on Replit](https://replit.com/@GroqCloud/CrewAI-Machine-Learning-Assistant) or run it on the command line with `python main.py`. You can upload a sample .csv to the same directory as `main.py` to give the application a head start on your ML problem. The application will output a Markdown file including Python code for your ML use case to the same directory as `main.py`.

+ 184 - 0
recipes/3p_integrations/groq/groq-example-templates/crewai-agents/main.py

@@ -0,0 +1,184 @@
+import pandas as pd
+import os
+from crewai import Agent, Task, Crew
+from langchain_groq import ChatGroq
+
+
+def main():
+    """
+    Main function to initialize and run the CrewAI Machine Learning Assistant.
+
+    This function sets up a machine learning assistant using the Llama 3 model with the ChatGroq API.
+    It provides a text-based interface for users to define, assess, and solve machine learning problems
+    by interacting with multiple specialized AI agents. The function outputs the results to the console 
+    and writes them to a markdown file.
+
+    Steps:
+    1. Initialize the ChatGroq API with the specified model and API key.
+    2. Display introductory text about the CrewAI Machine Learning Assistant.
+    3. Create and configure four AI agents:
+        - Problem_Definition_Agent: Clarifies the machine learning problem.
+        - Data_Assessment_Agent: Evaluates the quality and suitability of the provided data.
+        - Model_Recommendation_Agent: Suggests suitable machine learning models.
+        - Starter_Code_Generator_Agent: Generates starter Python code for the project.
+    4. Prompt the user to describe their machine learning problem.
+    5. Check if a .csv file is available in the current directory and try to read it as a DataFrame.
+    6. Define tasks for the agents based on user input and data availability.
+    7. Create a Crew instance with the agents and tasks, and run the tasks.
+    8. Print the results and write them to an output markdown file.
+    """
+
+    model = 'llama3-8b-8192'
+
+    llm = ChatGroq(
+            temperature=0, 
+            groq_api_key = os.getenv('GROQ_API_KEY'), 
+            model_name=model
+        )
+
+    print('CrewAI Machine Learning Assistant')
+    multiline_text = """
+    The CrewAI Machine Learning Assistant is designed to guide users through the process of defining, assessing, and solving machine learning problems. It leverages a team of AI agents, each with a specific role, to clarify the problem, evaluate the data, recommend suitable models, and generate starter Python code. Whether you're a seasoned data scientist or a beginner, this application provides valuable insights and a head start in your machine learning projects.
+    """
+
+    print(multiline_text)
+
+
+    Problem_Definition_Agent = Agent(
+        role='Problem_Definition_Agent',
+        goal="""clarify the machine learning problem the user wants to solve, 
+            identifying the type of problem (e.g., classification, regression) and any specific requirements.""",
+        backstory="""You are an expert in understanding and defining machine learning problems. 
+            Your goal is to extract a clear, concise problem statement from the user's input, 
+            ensuring the project starts with a solid foundation.""",
+        verbose=True,
+        allow_delegation=False,
+        llm=llm,
+    )
+
+    Data_Assessment_Agent = Agent(
+        role='Data_Assessment_Agent',
+        goal="""evaluate the data provided by the user, assessing its quality, 
+            suitability for the problem, and suggesting preprocessing steps if necessary.""",
+        backstory="""You specialize in data evaluation and preprocessing. 
+            Your task is to guide the user in preparing their dataset for the machine learning model, 
+            including suggestions for data cleaning and augmentation.""",
+        verbose=True,
+        allow_delegation=False,
+        llm=llm,
+    )
+
+    Model_Recommendation_Agent = Agent(
+        role='Model_Recommendation_Agent',
+        goal="""suggest the most suitable machine learning models based on the problem definition 
+            and data assessment, providing reasons for each recommendation.""",
+        backstory="""As an expert in machine learning algorithms, you recommend models that best fit 
+            the user's problem and data. You provide insights into why certain models may be more effective than others,
+            considering classification vs regression and supervised vs unsupervised frameworks.""",
+        verbose=True,
+        allow_delegation=False,
+        llm=llm,
+    )
+
+
+    Starter_Code_Generator_Agent = Agent(
+        role='Starter_Code_Generator_Agent',
+        goal="""generate starter Python code for the project, including data loading, 
+            model definition, and a basic training loop, based on findings from the problem definitions,
+            data assessment and model recommendation""",
+        backstory="""You are a code wizard, able to generate starter code templates that users 
+            can customize for their projects. Your goal is to give users a head start in their coding efforts.""",
+        verbose=True,
+        allow_delegation=False,
+        llm=llm,
+    )
+
+
+    user_question = input("Describe your ML problem: ")
+    data_upload = False
+    # Check if there is a .csv file in the current directory
+    if any(file.endswith(".csv") for file in os.listdir()):
+        sample_fp = [file for file in os.listdir() if file.endswith(".csv")][0]
+        try:
+            # Attempt to read the uploaded file as a DataFrame
+            df = pd.read_csv(sample_fp).head(5)
+
+            # If successful, set 'data_upload' to True
+            data_upload = True
+
+            # Display the DataFrame in the app
+            print("Data successfully uploaded and read as DataFrame:")
+            print(df)
+        except Exception as e:
+            print(f"Error reading the file: {e}")
+
+    if user_question:
+
+        task_define_problem = Task(
+        description="""Clarify and define the machine learning problem, 
+            including identifying the problem type and specific requirements.
+
+            Here is the user's problem:
+            {ml_problem}
+            """.format(ml_problem=user_question),
+        agent=Problem_Definition_Agent,
+        expected_output="A clear and concise definition of the machine learning problem."
+        )
+
+        if data_upload:
+            task_assess_data = Task(
+                description="""Evaluate the user's data for quality and suitability, 
+                suggesting preprocessing or augmentation steps if needed.
+
+                Here is a sample of the user's data:
+                {df}
+                The file name is called {uploaded_file}
+
+                """.format(df=df.head(),uploaded_file=sample_fp),
+                agent=Data_Assessment_Agent,
+                expected_output="An assessment of the data's quality and suitability, with suggestions for preprocessing or augmentation if necessary."
+            )
+        else:
+            task_assess_data = Task(
+                description="""The user has not uploaded any specific data for this problem,
+                but please go ahead and consider a hypothetical dataset that might be useful
+                for their machine learning problem. 
+                """,
+                agent=Data_Assessment_Agent,
+                expected_output="A hypothetical dataset that might be useful for the user's machine learning problem, along with any necessary preprocessing steps."
+            )
+
+        task_recommend_model = Task(
+        description="""Suggest suitable machine learning models for the defined problem 
+            and assessed data, providing rationale for each suggestion.""",
+        agent=Model_Recommendation_Agent,
+        expected_output="A list of suitable machine learning models for the defined problem and assessed data, along with the rationale for each suggestion."
+        )
+
+
+        task_generate_code = Task(
+        description="""Generate starter Python code tailored to the user's project using the model recommendation agent's recommendation(s), 
+            including snippets for package import, data handling, model definition, and training
+            """,
+        agent=Starter_Code_Generator_Agent,
+        expected_output="Python code snippets for package import, data handling, model definition, and training, tailored to the user's project, plus a brief summary of the problem and model recommendations."
+        )
+
+
+        crew = Crew(
+            agents=[Problem_Definition_Agent, Data_Assessment_Agent, Model_Recommendation_Agent,  Starter_Code_Generator_Agent], 
+            tasks=[task_define_problem, task_assess_data, task_recommend_model,  task_generate_code], 
+            verbose=False
+        )
+
+        result = crew.kickoff()
+
+        print(result)
+
+        with open('output.md', "w") as file:
+            print('\n\nThese results have been exported to output.md')
+            file.write(result)
+
+
+if __name__ == "__main__":
+    main()

+ 3 - 0
recipes/3p_integrations/groq/groq-example-templates/crewai-agents/requirements.txt

@@ -0,0 +1,3 @@
+crewai
+langchain_groq
+pandas

+ 21 - 0
recipes/3p_integrations/groq/groq-example-templates/groq-quickstart-conversational-chatbot/README.md

@@ -0,0 +1,21 @@
+# Groq Quickstart Conversational Chatbot
+
+A simple application that allows users to interact with a conversational chatbot powered by Groq. This application is designed to get users up and running quickly with building a chatbot.
+
+## Features
+
+**Conversational Interface**: Provides a simple interface where users can input text and receive responses from the chatbot.
+
+**Short Responses**: The chatbot replies with very short and concise answers, keeping interactions brief and to the point.
+
+**Groq Integration**: Utilizes the Groq API to generate responses, leveraging the power of the Llama3-70b-8192 model.
+
+## Usage
+
+<!-- markdown-link-check-disable -->
+
+You will need to store a valid Groq API Key as a secret to proceed with this example. You can generate one for free [here](https://console.groq.com/keys).
+
+<!-- markdown-link-check-enable -->
+
+You can [fork and run this application on Replit](https://replit.com/@GroqCloud/Groq-Quickstart-Conversational-Chatbot) or run it on the command line with `python main.py`.

+ 38 - 0
recipes/3p_integrations/groq/groq-example-templates/groq-quickstart-conversational-chatbot/main.py

@@ -0,0 +1,38 @@
+#set GROQ_API_KEY in the secrets
+
+import os
+from groq import Groq
+
+# Create the Groq client
+client = Groq(
+    api_key=os.environ.get("GROQ_API_KEY")
+)
+
+# Set the system prompt
+system_prompt = {
+    "role": "system",
+    "content":
+    "You are a helpful assistant. You reply with very short answers."
+}
+
+# Initialize the chat history
+chat_history = [system_prompt]
+
+while True:
+  # Get user input from the console
+  user_input = input("You: ")
+
+  # Append the user input to the chat history
+  chat_history.append({"role": "user", "content": user_input})
+
+  response = client.chat.completions.create(model="llama3-70b-8192",
+                                            messages=chat_history,
+                                            max_tokens=100,
+                                            temperature=1.2)
+  # Append the response to the chat history
+  chat_history.append({
+      "role": "assistant",
+      "content": response.choices[0].message.content
+  })
+  # Print the response
+  print("Assistant:", response.choices[0].message.content)

+ 1 - 0
recipes/3p_integrations/groq/groq-example-templates/groq-quickstart-conversational-chatbot/requirements.txt

@@ -0,0 +1 @@
+groq

+ 27 - 0
recipes/3p_integrations/groq/groq-example-templates/groqing-the-stock-market-function-calling-llama3/README.md

@@ -0,0 +1,27 @@
+# 'Groqing the Stock Market' with Llama 3 Function Calling
+
+This is a simple application that leverages the yfinance API to provide insights into stocks and their prices. The application uses the Llama 3 model on Groq in conjunction with Langchain to call functions based on the user prompt.
+
+## Key Functions
+
+- **get_stock_info(symbol, key)**: This function fetches various information about a given stock symbol. The information can be anything from the company's address to its financial ratios. The 'key' parameter specifies the type of information to fetch.
+
+- **get_historical_price(symbol, start_date, end_date)**: This function fetches the historical stock prices for a given symbol from a specified start date to an end date. The returned data is a DataFrame with the date and closing price of the stock.
+
+- **plot_price_over_time(historical_price_dfs)**: This function takes a list of DataFrames (each containing historical price data for a stock) and plots the prices over time using Plotly. The plot is saved to the same directory as the app.
+
+- **call_functions(llm_with_tools, user_prompt)**: This function takes the user's question, invokes the appropriate tool (either get_stock_info or get_historical_price), and generates a response. If the user asked for historical prices, it also calls plot_price_over_time to generate a plot.
+
+## Function Calling
+
+The function calling in this application is handled by the Groq API, abstracted with Langchain. When the user asks a question, the application invokes the appropriate tool with parameters based on the user's question. The tool's output is then used to generate a response.
+
+## Usage
+
+<!-- markdown-link-check-disable -->
+
+You will need to store a valid Groq API Key as a secret to proceed with this example. You can generate one for free [here](https://console.groq.com/keys).
+
+<!-- markdown-link-check-enable -->
+
+You can [fork and run this application on Replit](https://replit.com/@GroqCloud/Groqing-the-Stock-Market-Function-Calling-with-Llama3) or run it on the command line with `python main.py`.

Разница между файлами не показана из-за своего большого размера
+ 139 - 0
recipes/3p_integrations/groq/groq-example-templates/groqing-the-stock-market-function-calling-llama3/main.py


+ 12 - 0
recipes/3p_integrations/groq/groq-example-templates/groqing-the-stock-market-function-calling-llama3/requirements.txt

@@ -0,0 +1,12 @@
+streamlit
+pandas
+numpy
+groq
+langchain_community
+langchain_groq
+yfinance
+plotly
+langchain_core
+nbformat>=4.2.0
+ipython
+kaleido

+ 21 - 0
recipes/3p_integrations/groq/groq-example-templates/llamachat-conversational-chatbot-with-llamaIndex/README.md

@@ -0,0 +1,21 @@
+# LlamaChat: Conversational Chatbot with LlamaIndex and Llama3
+
+A simple application that allows users to interact with a conversational chatbot powered by the LlamaIndex framework and Meta's Llama3 model. The application uses the Groq API to generate responses and supports different modes of interaction, including simple chat, streaming chat, and customizable chat with system prompts.
+
+## Features
+
+**LlamaIndex**: The application uses LlamaIndex to manage and generate responses, leveraging the power of Groq's language model.
+
+**Simple Chat**: Generates responses based on user input using the Groq API with LlamaIndex.
+
+**Streaming Chat**: Provides real-time streaming responses for user input.
+
+**Customizable Chat**: Allows for chat customization by setting a system prompt to guide the chatbot's responses.
+
+## Usage
+
+<!-- markdown-link-check-disable -->
+
+You will need to store a valid Groq API Key as a secret to proceed with this example. You can generate one for free [here](https://console.groq.com/keys).
+
+<!-- markdown-link-check-enable -->

+ 46 - 0
recipes/3p_integrations/groq/groq-example-templates/llamachat-conversational-chatbot-with-llamaIndex/main.py

@@ -0,0 +1,46 @@
+from llama_index.llms.groq import Groq
+from llama_index.core.llms import ChatMessage
+
+llm = Groq(model="llama3-8b-8192")
+
+
+system_prompt = 'You are a friendly but highly sarcastic chatbot assistant'
+
+while True:
+    # Get the user's question
+    user_input = input("User: ")
+
+    #user_input = 'write a few paragraphs explaining generative AI to a college freshman'
+
+    ##################################
+    # Simple Chat
+    ##################################
+    print('Simple Chat:\n\n')
+    response = llm.complete(user_input)
+    print(response)
+
+
+    ##################################
+    # Streaming Chat
+    ##################################
+    stream_response = llm.stream_complete(
+        user_input
+    )
+    print('\n\nStreaming Chat:\n')
+    for t in stream_response:
+        print(t.delta, end="")
+
+
+    ##################################
+    # Customizable Chat
+    ##################################
+    messages = [
+        ChatMessage(role="system", content=system_prompt),
+        ChatMessage(role="user", content=user_input),
+    ]
+    print('\n\nChat with System Prompt:\n')
+    response_with_system_prompt = llm.chat(messages)
+
+    print(response_with_system_prompt)
+
+

+ 2 - 0
recipes/3p_integrations/groq/groq-example-templates/llamachat-conversational-chatbot-with-llamaIndex/requirements.txt

@@ -0,0 +1,2 @@
+llama_index
+llama-index-llms-groq

+ 33 - 0
recipes/3p_integrations/groq/groq-example-templates/presidential-speeches-rag-with-pinecone/README.md

@@ -0,0 +1,33 @@
+# Presidential Speeches RAG with Pinecone
+
+This repository contains a command line application that allows users to ask questions about US presidential speeches by applying Retrieval-Augmented Generation (RAG) over a Pinecone vector database. The application uses RAG to answer the user's question by retrieving the most relevant presidential speeches and using them to supplement the LLM response.
+
+## Features
+
+- **RAG (Retrieval-Augmented Generation)**: Enhances the generation of responses by integrating retrieval-based methods. This feature allows the system to fetch relevant information from a large corpus of data, providing more accurate and contextually appropriate answers by combining retrieved content with generative capabilities.
+
+- **Vector Databases (Pinecone)**: Integrates with Pinecone to store and manage vector embeddings efficiently. Pinecone's high-performance vector database allows for fast and scalable similarity searches, enabling quick retrieval of relevant data for various machine learning and AI applications.
+
+- **LangChain Integration**: Leverages LangChain to facilitate natural language processing tasks. LangChain enhances the interaction between the user and the system by providing robust language modeling capabilities, ensuring seamless and intuitive communication.
+
+## Code Overview
+
+The main script of the application is [main.py](./main.py). Here's a brief overview of its main functions:
+
+- `get_relevant_excerpts(user_question, docsearch)`: This function takes a user's question and a Pinecone vector store as input, performs a similarity search on the vector store using the user's question, and returns the most relevant excerpts from presidential speeches.
+
+- `presidential_speech_chat_completion(client, model, user_question, relevant_excerpts)`: This function takes a Groq client, the name of a pre-trained model, a user's question, and relevant excerpts from presidential speeches as input. It generates a response to the user's question based on the relevant excerpts.
+
+## Usage
+
+<!-- markdown-link-check-disable -->
+
+You will need to store a valid Groq API Key as a secret to proceed with this example outside of this Repl. You can generate one for free [here](https://console.groq.com/keys).
+
+<!-- markdown-link-check-enable -->
+
+You would also need your own [Pinecone](https://www.pinecone.io/) index with presidential speech embeddings to run this code locally. You can create a Pinecone API key and one index for a small project for free on their Starter plan, and visit [this Cookbook post](https://github.com/groq/groq-api-cookbook/blob/dan/replit-conversion/presidential-speeches-rag/presidential-speeches-rag.ipynb) for more info on RAG and a guide to uploading these embeddings to a vector database
+
+You can [fork and run this application on Replit](https://replit.com/@GroqCloud/Presidential-Speeches-RAG-with-Pinecone) or run it on the command line with `python main.py`.

+ 114 - 0
recipes/3p_integrations/groq/groq-example-templates/presidential-speeches-rag-with-pinecone/main.py

@@ -0,0 +1,114 @@
+import pandas as pd
+import numpy as np
+from groq import Groq
+from pinecone import Pinecone
+import os
+
+from langchain_community.embeddings.sentence_transformer import SentenceTransformerEmbeddings
+from langchain_pinecone import PineconeVectorStore
+
+
+def get_relevant_excerpts(user_question, docsearch):
+    """
+    This function retrieves the most relevant excerpts from presidential speeches based on the user's question.
+    Parameters:
+    user_question (str): The question asked by the user.
+    docsearch (PineconeVectorStore): The Pinecone vector store containing the presidential speeches.
+    Returns:
+    str: A string containing the most relevant excerpts from presidential speeches.
+    """
+
+    # Perform a similarity search on the Pinecone vector store using the user's question
+    relevent_docs = docsearch.similarity_search(user_question)
+
+    # Extract the page content from the top 3 most relevant documents and join them into a single string
+    relevant_excerpts = '\n\n------------------------------------------------------\n\n'.join([doc.page_content for doc in relevent_docs[:3]])
+
+    return relevant_excerpts
+
+
+def presidential_speech_chat_completion(client, model, user_question, relevant_excerpts):
+    """
+    This function generates a response to the user's question using a pre-trained model.
+    Parameters:
+    client (Groq): The Groq client used to interact with the pre-trained model.
+    model (str): The name of the pre-trained model.
+    user_question (str): The question asked by the user.
+    relevant_excerpts (str): A string containing the most relevant excerpts from presidential speeches.
+    Returns:
+    str: A string containing the response to the user's question.
+    """
+
+    # Define the system prompt
+    system_prompt = '''
+    You are a presidential historian. Given the user's question and relevant excerpts from 
+    presidential speeches, answer the question by including direct quotes from presidential speeches. 
+    When using a quote, site the speech that it was from (ignoring the chunk).
+    '''
+
+    # Generate a response to the user's question using the pre-trained model
+    chat_completion = client.chat.completions.create(
+        messages = [
+            {
+                "role": "system",
+                "content":  system_prompt
+            },
+            {
+                "role": "user",
+                "content": "User Question: " + user_question + "\n\nRelevant Speech Exerpt(s):\n\n" + relevant_excerpts,
+            }
+        ],
+        model = model
+    )
+
+    # Extract the response from the chat completion
+    response = chat_completion.choices[0].message.content
+
+    return response
+
+
+def main():
+    """
+    This is the main function that runs the application. It initializes the Groq client and the SentenceTransformer model,
+    gets user input from the Streamlit interface, retrieves relevant excerpts from presidential speeches based on the user's question,
+    generates a response to the user's question using a pre-trained model, and displays the response.
+    """
+
+    model = 'llama3-8b-8192'
+
+    embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
+
+    # Initialize the Groq client
+    groq_api_key = os.getenv('GROQ_API_KEY')
+    pinecone_api_key=os.getenv('PINECONE_API_KEY')
+    pinecone_index_name = "presidential-speeches"
+    client = Groq(
+        api_key=groq_api_key
+    )
+
+    pc = Pinecone(api_key = pinecone_api_key)
+    docsearch = PineconeVectorStore(index_name=pinecone_index_name, embedding=embedding_function)
+
+    # Display the title and introduction of the application
+    print("Presidential Speeches RAG")
+    multiline_text = """
+    Welcome! Ask questions about U.S. presidents, like "What were George Washington's views on democracy?" or "What did Abraham Lincoln say about national unity?". The app matches your question to relevant excerpts from presidential speeches and generates a response using a pre-trained model.
+    """
+
+    print(multiline_text)
+
+
+    while True:
+        # Get the user's question
+        user_question = input("Ask a question about a US president: ")
+
+        if user_question:
+            pinecone_index_name = "presidential-speeches"
+            relevant_excerpts = get_relevant_excerpts(user_question, docsearch)
+            response = presidential_speech_chat_completion(client, model, user_question, relevant_excerpts)
+            print(response)
+
+
+
+if __name__ == "__main__":
+    main()

+ 8 - 0
recipes/3p_integrations/groq/groq-example-templates/presidential-speeches-rag-with-pinecone/requirements.txt

@@ -0,0 +1,8 @@
+pandas
+numpy
+groq
+langchain_community
+langchain_pinecone
+transformers
+scikit-learn
+sentence-transformers

+ 57 - 0
recipes/3p_integrations/groq/groq-example-templates/text-to-sql-json-mode/README.md

@@ -0,0 +1,57 @@
+# DuckDB Text-to-SQL with JSON Mode
+
+A command line application that allows users to ask questions about their DuckDB data. The application leverages Groq API's JSON mode to generate SQL queries based on the user's questions and execute them on a DuckDB database.
+
+## Features
+
+- **Text-to-SQL**: The application uses natural language processing to convert user questions into SQL queries, making it easy for users to query their data without knowing SQL.
+
+- **JSON mode**: A feature which enables the LLM to respond strictly in a structured JSON output, provided we supply it with the desired format
+
+- **Data Summarization**: After executing a SQL query, the application uses the AI to summarize the resulting data in relation to the user's original question.
+
+## Data
+
+The application queries data from two CSV files located in the `data` folder:
+
+- `employees.csv`: Contains employee data including their ID, full name, and email address.
+
+- `purchases.csv`: Records purchase details including purchase ID, date, associated employee ID, amount, and product name.
+
+## Prompts
+
+The base prompt for the AI is stored in a text file in the `prompts` folder:
+
+- `base_prompt.txt`
+
+A well-crafted system prompt is essential for building a functional Text-to-SQL application. Ours will serve 3 purposes:
+
+1. Provide the metadata schemas for our database tables
+2. Indicate any relevant context or tips for querying the DuckDB language or our database schema specifically
+3. Define our desired JSON output (note that to use JSON mode, we must include 'JSON' in the prompt)
+
+## Functions
+
+- `chat_with_groq()`: Sends a prompt to the Groq API and returns the AI's response.
+- `execute_duckdb_query()`: Executes a SQL query on a DuckDB database and returns the result.
+- `get_summarization()`: Generates a prompt for the AI to summarize the data resulting from a SQL query.
+
+## Usage
+
+<!-- markdown-link-check-disable -->
+
+You will need to store a valid Groq API Key as a secret to proceed with this example. You can generate one for free [here](https://console.groq.com/keys).
+
+<!-- markdown-link-check-enable -->
+
+You can [fork and run this application on Replit](https://replit.com/@GroqCloud/Building-a-Text-to-SQL-app-with-Groqs-JSON-mode) or run it on the command line with `python main.py`.
+
+## Customizing with Your Own Data
+
+This application is designed to be flexible and can be easily customized to work with your own data. If you want to use your own data, follow these steps:
+
+1. **Replace the CSV files**: The application queries data from two CSV files located in the `data` folder: `employees.csv` and `purchases.csv`. Replace these files with your own CSV files.
+
+2. **Modify the base prompt**: The base prompt for the AI, stored in the `prompts` folder as `base_prompt.txt`, contains specific information about the data metadata. Modify this prompt to match the structure and content of your own data. Make sure to accurately describe the tables, columns, and any specific rules or tips for querying your dataset.
+
+By following these steps, you can tailor the DuckDB Query Generator to your own data and use cases. Feel free to experiment and build off this repository to create your own powerful data querying applications.

+ 8 - 0
recipes/3p_integrations/groq/groq-example-templates/text-to-sql-json-mode/data/employees.csv

@@ -0,0 +1,8 @@
+employee_id,name,email
+1,Richard Hendricks,richard@piedpiper.com
+2,Erlich Bachman,erlich@aviato.com
+3,Dinesh Chugtai,dinesh@piedpiper.com
+4,Bertram Gilfoyle,gilfoyle@piedpiper.com
+5,Jared Dunn,jared@piedpiper.com
+6,Monica Hall,monica@raviga.com
+7,Gavin Belson,gavin@hooli.com

+ 6 - 0
recipes/3p_integrations/groq/groq-example-templates/text-to-sql-json-mode/data/purchases.csv

@@ -0,0 +1,6 @@
+purchase_id,purchase_date,product_name,employee_id,amount
+1,'2024-02-01',iPhone,1,750
+2,'2024-02-02',Tesla,2,70000
+3,'2024-02-03',Humane pin,3,500
+4,'2024-02-04',iPhone,4,700
+5,'2024-02-05',Tesla,5,75000

+ 145 - 0
recipes/3p_integrations/groq/groq-example-templates/text-to-sql-json-mode/main.py

@@ -0,0 +1,145 @@
+import os
+from groq import Groq
+import json
+import duckdb
+import sqlparse
+
+def chat_with_groq(client, prompt, model, response_format):
+    """
+    This function sends a prompt to the Groq API and retrieves the AI's response.
+
+    Parameters:
+    client (Groq): The Groq API client.
+    prompt (str): The prompt to send to the AI.
+    model (str): The AI model to use for the response.
+    response_format (dict): The format of the response. 
+        If response_format is a dictionary with {"type": "json_object"}, it configures JSON mode.
+
+    Returns:
+    str: The content of the AI's response.
+    """
+    
+    completion = client.chat.completions.create(
+    model=model,
+    messages=[
+        {
+            "role": "user",
+            "content": prompt
+        }
+    ],
+    response_format=response_format
+    )
+
+    return completion.choices[0].message.content
+
+
+def execute_duckdb_query(query):
+    """
+    This function executes a SQL query on a DuckDB database and returns the result.
+
+    Parameters:
+    query (str): The SQL query to execute.
+
+    Returns:
+    DataFrame: The result of the query as a pandas DataFrame.
+    """
+    original_cwd = os.getcwd()
+    os.chdir('data')
+
+    try:
+        conn = duckdb.connect(database=':memory:', read_only=False)
+        query_result = conn.execute(query).fetchdf().reset_index(drop=True)
+    finally:
+        os.chdir(original_cwd)
+
+    return query_result
+
+
+def get_summarization(client, user_question, df, model):
+    """
+    This function generates a summarization prompt based on the user's question and the resulting data. 
+    It then sends this summarization prompt to the Groq API and retrieves the AI's response.
+
+    Parameters:
+    client (Groqcloud): The Groq API client.
+    user_question (str): The user's question.
+    df (DataFrame): The DataFrame resulting from the SQL query.
+    model (str): The AI model to use for the response.
+    
+    Returns:
+    str: The content of the AI's response to the summarization prompt.
+    """
+    prompt = '''
+    A user asked the following question pertaining to local database tables:
+    
+    {user_question}
+    
+    To answer the question, a dataframe was returned:
+    
+    Dataframe:
+    {df}
+    
+    In a few sentences, summarize the data in the table as it pertains to the original user question. Avoid qualifiers like "based on the data" and do not comment on the structure or metadata of the table itself
+    '''.format(user_question = user_question, df = df)
+    
+    # Response format is set to 'None'
+    return chat_with_groq(client,prompt,model,None)
+
+def main():
+    """
+    The main function of the application. It handles user input, controls the flow of the application, 
+    and initiates a conversation in the command line.
+    """
+
+    model = "llama3-70b-8192"
+
+    # Get the Groq API key and create a Groq client
+    groq_api_key = os.getenv('GROQ_API_KEY')
+    client = Groq(
+        api_key=groq_api_key
+    )
+
+    print("Welcome to the DuckDB Query Generator!")
+    print("You can ask questions about the data in the 'employees.csv' and 'purchases.csv' files.")
+
+    # Load the base prompt
+    with open('prompts/base_prompt.txt', 'r') as file:
+        base_prompt = file.read()
+
+    while True:
+        # Get the user's question
+        user_question = input("Ask a question: ")
+
+        if user_question:
+            # Generate the full prompt for the AI
+            full_prompt = base_prompt.format(user_question=user_question)
+
+            # Get the AI's response. Call with '{"type": "json_object"}' to use JSON mode
+            llm_response = chat_with_groq(client, full_prompt, model, {"type": "json_object"})
+
+            result_json = json.loads(llm_response)
+            if 'sql' in result_json:
+                sql_query = result_json['sql']
+                results_df = execute_duckdb_query(sql_query)
+
+                formatted_sql_query = sqlparse.format(sql_query, reindent=True, keyword_case='upper')
+
+                print("```sql\n" + formatted_sql_query + "\n```")
+                print(results_df.to_markdown(index=False))
+
+                summarization = get_summarization(client,user_question,results_df,model)
+                print(summarization.replace('$','\\$'))
+            elif 'error' in result_json:
+                print("ERROR:", 'Could not generate valid SQL for this question')
+                print(result_json['error'])
+
+if __name__ == "__main__":
+    main()
+
+
+
+
+
+
+
+

+ 42 - 0
recipes/3p_integrations/groq/groq-example-templates/text-to-sql-json-mode/prompts/base_prompt.txt

@@ -0,0 +1,42 @@
+You are Groq Advisor, and you are tasked with generating SQL queries for DuckDB based on user questions about data stored in two tables derived from CSV files:
+
+Table: employees.csv
+Columns:
+employee_id (INTEGER): A unique identifier for each employee.
+name (VARCHAR): The full name of the employee.
+email (VARCHAR): employee's email address
+
+Table: purchases.csv
+Columns:
+purchase_id (INTEGER): A unique identifier for each purchase.
+purchase_date (DATE): Date of purchase
+employee_id (INTEGER): References the employee_id from the employees table, indicating which employee made the purchase.
+amount (FLOAT): The monetary value of the purchase.
+product_name (STRING): The name of the product purchased
+
+Given a user's question about this data, write a valid DuckDB SQL query that accurately extracts or calculates the requested information from these tables and adheres to SQL best practices for DuckDB, optimizing for readability and performance where applicable.
+
+Here are some tips for writing DuckDB queries:
+* DuckDB syntax requires querying from the .csv file itself, i.e. employees.csv and purchases.csv. For example: SELECT * FROM employees.csv as employees
+* All tables referenced MUST be aliased
+* DuckDB does not implicitly include a GROUP BY clause
+* CURRENT_DATE gets today's date
+* Aggregated fields like COUNT(*) must be appropriately named
+
+And some rules for querying the dataset:
+* Never include employee_id in the output - show employee name instead
+
+Also note that:
+* Valid values for product_name include 'Tesla','iPhone' and 'Humane pin'
+
+
+Question:
+--------
+{user_question}
+--------
+Reminder: Generate a DuckDB SQL to answer to the question:
+* respond as a valid JSON Document
+* [Best] If the question can be answered with the available tables: {{"sql": <sql here>}} 
+* If the question cannot be answered with the available tables: {{"error": <explanation here>}}
+* Ensure that the entire output is returned on only one single line
+* Keep your query as simple and straightforward as possible; do not use subqueries

+ 4 - 0
recipes/3p_integrations/groq/groq-example-templates/text-to-sql-json-mode/requirements.txt

@@ -0,0 +1,4 @@
+duckdb
+groq
+sqlparse
+pandas
+tabulate

+ 53 - 0
recipes/3p_integrations/groq/groq-example-templates/verified-sql-function-calling/README.md

@@ -0,0 +1,53 @@
+# Executing Verified Queries with Function Calling
+
+A command line application that allows users to ask questions about their DuckDB data using the Groq API. The application uses function calling to find the most similar pre-verified query to the user's question, execute it against the data, and return the results.
+
+## Features
+
+- **Function Calling**: The application uses function calling to match the user's question to the most relevant pre-verified SQL query.
+
+- **SQL Execution**: The application executes the selected SQL query on a DuckDB database and displays the result.
+
+## Functions
+
+- `get_verified_queries(directory_path)`: Reads YAML files from the specified directory and loads the verified SQL queries and their descriptions.
+
+- `execute_duckdb_query_function_calling(query_name, verified_queries_dict)`: Executes the provided SQL query using DuckDB and returns the result as a DataFrame.
+
+## Data
+
+The application queries data from CSV files located in the data folder:
+
+- `employees.csv`: Contains employee data including their ID, full name, and email address.
+
+- `purchases.csv`: Records purchase details including purchase ID, date, associated employee ID, amount, and product name.
+
+## Verified Queries
+
+The verified SQL queries and their descriptions are stored in YAML files located in the `verified-queries` folder. Descriptions are used to semantically map prompts to queries:
+
+- `most-recent-purchases.yaml`: Returns the 5 most recent purchases
+
+- `most-expensive-purchase.yaml`: Finds the most expensive purchases
+
+- `number-of-teslas.yaml`: Counts the number of Teslas purchased
+
+- `employees-without-purchases.yaml`: Gets employees without any recent purchases
+
+## Usage
+
+<!-- markdown-link-check-disable -->
+
+You will need to store a valid Groq API Key as a secret to proceed with this example. You can generate one for free [here](https://console.groq.com/keys).
+
+<!-- markdown-link-check-enable -->
+
+You can [fork and run this application on Replit](https://replit.com/@GroqCloud/Execute-Verified-SQL-Queries-with-Function-Calling) or run it on the command line with `python main.py`.
+
+## Customizing with Your Own Data
+
+This application is designed to be flexible and can be easily customized to work with your own data. If you want to use your own data, follow these steps:
+
+1. **Replace the CSV files**: The application queries data from CSV files located in the `data` folder. Replace these files with your own CSV files.
+
+2. **Modify the verified queries**: The verified SQL queries and their descriptions are stored in YAML files located in the `verified-queries` folder. Replace these files with your own verified SQL queries and descriptions.

+ 8 - 0
recipes/3p_integrations/groq/groq-example-templates/verified-sql-function-calling/data/employees.csv

@@ -0,0 +1,8 @@
+employee_id,name,email
+1,Richard Hendricks,richard@piedpiper.com
+2,Erlich Bachman,erlich@aviato.com
+3,Dinesh Chugtai,dinesh@piedpiper.com
+4,Bertram Gilfoyle,gilfoyle@piedpiper.com
+5,Jared Dunn,jared@piedpiper.com
+6,Monica Hall,monica@raviga.com
+7,Gavin Belson,gavin@hooli.com

+ 6 - 0
recipes/3p_integrations/groq/groq-example-templates/verified-sql-function-calling/data/purchases.csv

@@ -0,0 +1,6 @@
+purchase_id,purchase_date,product_name,employee_id,amount
+1,'2024-02-01',iPhone,1,750
+2,'2024-02-02',Tesla,2,70000
+3,'2024-02-03',Humane pin,3,500
+4,'2024-02-04',iPhone,4,700
+5,'2024-02-05',Tesla,5,75000

+ 158 - 0
recipes/3p_integrations/groq/groq-example-templates/verified-sql-function-calling/main.py

@@ -0,0 +1,158 @@
+import os
+from groq import Groq
+import duckdb
+import yaml
+import glob
+import json
+
+def get_verified_queries(directory_path):
+    """
+    Reads YAML files from the specified directory, loads the verified SQL queries and their descriptions,
+    and stores them in a dictionary.
+
+    Parameters:
+        directory_path (str): The path to the directory containing the YAML files with verified queries.
+                              A trailing path separator is optional.
+
+    Returns:
+        dict: A dictionary where the keys are the names of the YAML files (without the directory path and file extension)
+              and the values are the parsed content of the YAML files. Files that are not valid YAML are
+              reported on stdout and left out of the dictionary.
+    """
+    verified_queries_dict = {}
+    # Sorted so the resulting mapping does not depend on filesystem enumeration order.
+    for file in sorted(glob.glob(os.path.join(directory_path, '*.yaml'))):
+        # Derive the key from the path itself. Slicing by len(directory_path) only produced
+        # the right name when the caller happened to pass a trailing separator.
+        file_name = os.path.splitext(os.path.basename(file))[0]
+        with open(file, 'r', encoding='utf-8') as stream:
+            try:
+                verified_queries_dict[file_name] = yaml.safe_load(stream)
+            except yaml.YAMLError as exc:
+                # Keep loading the remaining files, but do not hide the problem.
+                print("Skipping {}: invalid YAML ({})".format(file, exc))
+
+    return verified_queries_dict
+
+
+def execute_duckdb_query_function_calling(query_name,verified_queries_dict):
+    """
+    Executes a SQL query from the verified queries dictionary using DuckDB and returns the result as a DataFrame.
+
+    Parameters:
+        query_name (str): The name of the query to be executed, corresponding to a key in the verified queries dictionary.
+        verified_queries_dict (dict): A dictionary containing verified queries, where the keys are query names and the values
+                                      are dictionaries with query details including the SQL statement.
+
+    Returns:
+        pandas.DataFrame: The result of the executed query as a DataFrame.
+
+    Raises:
+        KeyError: If query_name is not a key of verified_queries_dict.
+    """
+    # Resolve the SQL before changing directory, so an unknown name cannot leave
+    # the process stranded inside 'data'.
+    if query_name not in verified_queries_dict:
+        raise KeyError("Unknown verified query: {!r}".format(query_name))
+    query = verified_queries_dict[query_name]['sql']
+
+    original_cwd = os.getcwd()
+    # The query is executed with 'data' as the working directory and the previous
+    # directory is always restored afterwards.
+    os.chdir('data')
+    try:
+        conn = duckdb.connect(database=':memory:', read_only=False)
+        try:
+            query_result = conn.execute(query).fetchdf().reset_index(drop=True)
+        finally:
+            # Release the in-memory database even when the query fails.
+            conn.close()
+    finally:
+        os.chdir(original_cwd)
+
+    return query_result
+
+
+model = "llama3-8b-8192"
+
+# Initialize the Groq client
+groq_api_key = os.getenv('GROQ_API_KEY')
+client = Groq(
+    api_key=groq_api_key
+)
+
+directory_path = 'verified-queries/'
+verified_queries_dict = get_verified_queries(directory_path)
+
+# Simplify verified_queries_dict to just show query name and description.
+# Built once up front: the verified queries are only loaded at startup.
+query_description_mapping = {key: subdict['description'] for key, subdict in verified_queries_dict.items()}
+
+# Define the tool (function) to be used by the Groq API
+tools = [
+    {
+        "type": "function",
+        "function": {
+            "name": "execute_duckdb_query_function_calling",
+            "description": "Executes a verified DuckDB SQL Query",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "query_name": {
+                        "type": "string",
+                        "description": "The name of the verified query (i.e. 'most-recent-purchases')",
+                    }
+                },
+                "required": ["query_name"],
+            },
+        },
+    }
+]
+
+# Define a dictionary of available functions
+available_functions = {
+    "execute_duckdb_query_function_calling": execute_duckdb_query_function_calling,
+}
+
+# Display the title and introduction of the application
+multiline_text = """
+Welcome! Ask questions about employee data or purchase details, like "Show the 5 most recent purchases" or "What was the most expensive purchase?". The app matches your question to pre-verified SQL queries for accurate results.
+"""
+
+print(multiline_text)
+
+
+while True:
+    # Get user input from the console; leave the loop cleanly on Ctrl-D / Ctrl-C
+    try:
+        user_input = input("You: ")
+    except (EOFError, KeyboardInterrupt):
+        print()
+        break
+
+    # Nothing to match against; prompt again without spending an API call
+    if not user_input.strip():
+        continue
+
+    # Step 1: send the conversation and available functions to the model
+    # Define the messages to be sent to the Groq API
+    messages = [
+        {
+            "role": "system",
+            "content": '''You are a function calling LLM that uses the data extracted from the execute_duckdb_query_function_calling function to answer questions around a DuckDB dataset.
+
+            Extract the query_name parameter from this mapping by finding the one whose description best matches the user's question: 
+            {query_description_mapping}
+            '''.format(query_description_mapping=query_description_mapping)
+        },
+        {
+            "role": "user",
+            "content": user_input,
+        }
+    ]
+
+    # Send the conversation and available functions to the Groq API
+    response = client.chat.completions.create(
+        model=model,
+        messages=messages,
+        tools=tools,
+        tool_choice="auto",
+        max_tokens=4096
+    )
+
+    # Extract the response message and any tool calls from the response
+    response_message = response.choices[0].message
+    # With tool_choice="auto" the model may reply without calling the tool, in which
+    # case tool_calls is not iterable; normalise it to an empty list.
+    tool_calls = response_message.tool_calls or []
+
+    if not tool_calls:
+        print('No verified query matched that question.')
+        if response_message.content:
+            print(response_message.content)
+        continue
+
+    # Iterate over the tool calls in the response
+    for tool_call in tool_calls:
+        function_name = tool_call.function.name  # Get the function name
+        function_to_call = available_functions.get(function_name)  # Get the function to call
+        if function_to_call is None:
+            print('Model requested an unknown function: ', function_name)
+            continue
+
+        # The arguments arrive as model-generated JSON text and may be malformed
+        try:
+            function_args = json.loads(tool_call.function.arguments)  # Parse the function arguments
+        except json.JSONDecodeError:
+            print('Model returned malformed function arguments.')
+            continue
+
+        query_name = function_args.get("query_name") if isinstance(function_args, dict) else None
+        # Only run queries that were actually loaded from the verified-queries directory
+        if query_name not in verified_queries_dict:
+            print('No verified query named: ', query_name)
+            continue
+        print('Query found: ', query_name)
+
+        # Call the function with the provided arguments
+        function_response = function_to_call(
+            query_name=query_name,
+            verified_queries_dict=verified_queries_dict
+        )
+
+        # Print the function response (query result) for this tool call
+        print(function_response)
+

+ 9 - 0
recipes/3p_integrations/groq/groq-example-templates/verified-sql-function-calling/requirements.txt

@@ -0,0 +1,9 @@
+groq
+sentence-transformers
+langchain_community
+scikit-learn
+numpy
+pandas  # needed by DuckDB's fetchdf(), which main.py uses to return a DataFrame
+duckdb
+pyyaml
+sqlparse
+tabulate

+ 7 - 0
recipes/3p_integrations/groq/groq-example-templates/verified-sql-function-calling/verified-queries/employees-without-purchases.yaml

@@ -0,0 +1,7 @@
+# Verified query: employees with no purchase on or after 2024-02-01 (anti-join).
+description: Employees without a purchase since Feb 1, 2024
+sql: |
+  -- The date filter sits in the ON clause, not WHERE, so employees with no
+  -- qualifying purchase survive the LEFT JOIN with NULL purchase columns.
+  -- ">=" makes a purchase made on Feb 1 itself count as "since Feb 1".
+  SELECT employees.name AS employees_without_purchases
+  FROM employees.csv AS employees
+  LEFT JOIN purchases.csv AS purchases
+    ON employees.employee_id = purchases.employee_id
+    AND purchases.purchase_date >= '2024-02-01'
+  WHERE purchases.purchase_id IS NULL

+ 9 - 0
recipes/3p_integrations/groq/groq-example-templates/verified-sql-function-calling/verified-queries/most-expensive-purchase.yaml

@@ -0,0 +1,9 @@
+# Verified query: the employee behind the single highest-priced purchase.
+description: Employee with the most expensive purchase
+sql: |
+  -- One row per employee name with that name's largest purchase amount;
+  -- sorting descending and keeping the first row yields the overall maximum.
+  SELECT
+    employees.name AS employee_name,
+    MAX(purchases.amount) AS max_purchase_amount
+  FROM purchases.csv AS purchases
+  INNER JOIN employees.csv AS employees
+    ON purchases.employee_id = employees.employee_id
+  GROUP BY employees.name
+  ORDER BY max_purchase_amount DESC
+  LIMIT 1

+ 9 - 0
recipes/3p_integrations/groq/groq-example-templates/verified-sql-function-calling/verified-queries/most-recent-purchases.yaml

@@ -0,0 +1,9 @@
+# Verified query: the five latest purchases with the purchasing employee's name.
+description: Five most recent purchases
+sql: |
+  -- purchase_id breaks ties between purchases made on the same date so the
+  -- five rows returned are deterministic.
+  SELECT
+    purchases.product_name,
+    purchases.amount,
+    employees.name
+  FROM purchases.csv AS purchases
+  INNER JOIN employees.csv AS employees
+    ON purchases.employee_id = employees.employee_id
+  ORDER BY
+    purchases.purchase_date DESC,
+    purchases.purchase_id DESC
+  LIMIT 5;

+ 6 - 0
recipes/3p_integrations/groq/groq-example-templates/verified-sql-function-calling/verified-queries/number-of-teslas.yaml

@@ -0,0 +1,6 @@
+# Verified query: how many purchase rows are Teslas.
+description: Number of Teslas purchased
+sql: |
+  -- The INNER JOIN limits the count to purchases whose employee_id matches an employee row.
+  SELECT COUNT(*) AS number_of_teslas
+  FROM purchases.csv AS purchases
+  INNER JOIN employees.csv AS employees
+    ON employees.employee_id = purchases.employee_id
+  WHERE purchases.product_name = 'Tesla'

Разница между файлами не показана из-за своего большого размера
+ 1708 - 0
recipes/3p_integrations/groq/llama3_cookbook_groq.ipynb


+ 26 - 0
recipes/3p_integrations/lamini/text2sql_memory_tuning/README.md

@@ -0,0 +1,26 @@
+# Tune Llama 3 for text-to-SQL and improve accuracy from 30% to 95%
+
+This repo and notebook `meta_lamini.ipynb` demonstrate how to tune Llama 3 to generate valid SQL queries and improve accuracy from 30% to 95%.
+
+In this notebook we'll be using Lamini, and more specifically, Lamini Memory Tuning.
+
+Lamini is an integrated platform for LLM inference and tuning for the enterprise. Lamini Memory Tuning is a new tool you can use to embed facts into LLMs that improves factual accuracy and reduces hallucinations. Inspired by information retrieval, this method has set a new standard of accuracy for LLMs with less developer effort.
+
+Learn more about Lamini Memory Tuning: https://www.lamini.ai/blog/lamini-memory-tuning
+
+Head over to https://app.lamini.ai/account to get your free API key.
+
+You can authenticate by writing the following to the file `~/.lamini/configure.yaml`:
+
+```
+production:
+    key: <YOUR-LAMINI-API-KEY>
+```
+
+This tuning tutorial uses the `nba_roster` sqlite database to tune a Llama 3 model.
+
+## Additional resources
+
+▫️ Fortune 500 case study: http://www.lamini.ai/blog/llm-text-to-sql <br>
+▫️ Technical paper: https://github.com/lamini-ai/Lamini-Memory-Tuning/blob/main/research-paper.pdf <br>
+▫️ Model weights: https://huggingface.co/engineering-lamini/lamini-1-random

BIN
recipes/3p_integrations/lamini/text2sql_memory_tuning/assets/manual_filtering.png


BIN
recipes/3p_integrations/lamini/text2sql_memory_tuning/assets/website.png


Разница между файлами не показана из-за своего большого размера
+ 40 - 0
recipes/3p_integrations/lamini/text2sql_memory_tuning/data/gold-test-set-v2.jsonl


+ 20 - 0
recipes/3p_integrations/lamini/text2sql_memory_tuning/data/gold-test-set.jsonl

@@ -0,0 +1,20 @@
+{"question": "What is the 99th percentile salary in the NBA?", "answer": "46741590", "sql": "SELECT (CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER)) as percentile FROM nba_roster WHERE SALARY!= '--' order by percentile limit 1 offset (select count(*) from nba_roster where SALARY != '--')*99/100-1;"}
+{"question": "What is the 75th percentile salary in the NBA?", "answer": "13932008", "sql": "SELECT (CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER)) as percentile FROM nba_roster WHERE SALARY!= '--' order by percentile limit 1 offset (select count(*) from nba_roster where SALARY != '--')*75/100-1;"}
+{"question": "What is the 25th percentile salary in the NBA?", "answer": "2413304", "sql": "SELECT (CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER)) as percentile FROM nba_roster WHERE SALARY!= '--' order by percentile limit 1 offset (select count(*) from nba_roster where SALARY != '--')*25/100-1;"}
+{"question": "What is the median weight in the NBA?", "answer": "215", "sql": "select CAST(SUBSTR(WT, 1, INSTR(WT,' ')) as INTEGER) as percentile from nba_roster order by percentile limit 1 offset (select count(*) from nba_roster)/2;"}
+{"question": "What is the average weight in the NBA?", "answer": "214.98", "sql": "SELECT AVG(CAST(SUBSTR(WT, 1, INSTR(WT,' ')) as INTEGER)) FROM nba_roster;"}
+{"question": "What is the median height in the NBA?", "answer": "6.58333333333333", "sql": "select CAST(SUBSTR(HT, 1, INSTR(HT,' ')-1) AS INTEGER)+ CAST(SUBSTR(HT, INSTR(HT,' ')+1) AS FLOAT)/12 as percentile from nba_roster order by percentile limit 1 offset (select count(*) from nba_roster)/2;"}
+{"question": "What is the average height in the NBA?", "answer": "6.54986111111111", "sql": "select AVG(CAST(SUBSTR(HT, 1, INSTR(HT,' ')-1) AS INTEGER)+ CAST(SUBSTR(HT, INSTR(HT,' ')+1) AS FLOAT)/12) as height from nba_roster;"}
+{"question": "Can you tell me how many players are in the NBA?", "answer": "600", "sql": "select count(*) from nba_roster;"}
+{"question": "Would you please let me know what the highest paid players are for each position?", "answer": "The highest paid players are Nikola Jokic (C), Paul George (F), Norman Powell (G), Kevin Durant (PF), Stephen Curry (PG), LeBron James (SF), Bradley Beal (SG).", "sql": "SELECT name, pos, MAX(CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER)) as max_salary FROM nba_roster WHERE SALARY!= '--' GROUP BY POS;"}
+{"question": "Is Jalen Johnson 23 years old?", "answer": "No, Jalen Johnson is 21 years old", "sql" : "Select name, age from nba_roster where name='Jalen Johnson';"}
+{"question": "Who is the oldest player on the Brooklyn Nets?", "answer": "Spencer Dinwiddie, Dorian Finney-Smith, Royce O'Neale", "sql" : "SELECT NAME FROM nba_roster WHERE TEAM = 'Brooklyn Nets' AND AGE = (SELECT MAX(AGE) FROM nba_roster WHERE TEAM = 'Brooklyn Nets');"}
+{"question": "Who has the higest salary on the Memphis Grizzlies?", "answer": "Ja Morant", "sql" : "select salary, name from nba_roster where team='Memphis Grizzlies' and SALARY!= '--' ORDER BY CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER) DESC LIMIT 1;"}
+{"question": "Which player has the higest salary on the Cleveland Cavaliers?", "answer": "Darius Garland", "sql" : "select salary, name from nba_roster where team='Cleveland Cavaliers' and SALARY!= '--' ORDER BY CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER) DESC LIMIT 1;"}
+{"question": "Who is the highest paid center on the Dallas Mavericks?", "answer": "Dereck Lively II", "sql" : "select salary, name from nba_roster where team='Dallas Mavericks' and POS='C' and SALARY!= '--' ORDER BY CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER) DESC LIMIT 1;"}
+{"question": "How much is Marcus Smart getting paid?", "answer": "$18,833,712", "sql" : "select salary from nba_roster where name='Marcus Smart';"}
+{"question": "What's the average age of the Trail Blazers?", "answer": "24", "sql" : "select avg(age) from nba_roster where team='Portland Trail Blazers';"}
+{"question": "What's the median age of the NBA?", "answer": "25", "sql" : "select CAST(AGE as INTEGER) as percentile from nba_roster order by percentile limit 1 offset (select count(*) from nba_roster)/2;"}
+{"question": "What's the median age of the Miami Heat?", "answer": "26", "sql" : "select CAST(AGE as INTEGER) as percentile from nba_roster where team='Miami Heat' order by percentile limit 1 offset (select count(*) from nba_roster where team='Miami Heat')/2;"}
+{"question": "What are the 5 teams with the oldest average age in the NBA", "answer": "Golden State Warriors, Milwaukee Bucks, Miami Heat, LA Clippers, Phoenix Suns", "sql": "SELECT team, AVG(AGE) AS average_age FROM nba_roster GROUP BY team ORDER BY average_age DESC LIMIT 5;"}
+{"question": "What is the average salary of Power Forward players in the NBA", "answer": "$10948045", "sql": "select avg(CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER)) as average_salary from nba_roster where POS = 'PF';"}

+ 220 - 0
recipes/3p_integrations/lamini/text2sql_memory_tuning/data/training_data/archive/generated_queries_large_filtered_cleaned.jsonl

@@ -0,0 +1,220 @@
+{"question": "What is the average height of NBA players who are 25 years old or older", "sql": "SELECT AVG(CAST(SUBSTR(HT, 1, INSTR(HT,' ')-1) AS INTEGER)+ CAST(SUBSTR(HT, INSTR(HT,' ')+1) AS FLOAT)/12) AS average_height FROM nba_roster WHERE CAST(AGE AS INTEGER) >= 25;"}
+{"question": "Which team has the most players who attended the University of Michigan", "sql": "SELECT team, COUNT(*) AS num_players FROM nba_roster WHERE COLLEGE = 'Michigan' GROUP BY team ORDER BY num_players DESC LIMIT 1;"}
+{"question": "What is the most common position in the NBA", "sql": "SELECT POS, COUNT(*) as count FROM nba_roster GROUP BY POS ORDER BY count DESC LIMIT 1;"}
+{"question": "What is the average age of all players in the NBA", "sql": "SELECT AVG(AGE) FROM nba_roster;"}
+{"question": "What position has the most players aged 30 or older in the NBA", "sql": "SELECT POS, COUNT(*) AS count FROM nba_roster WHERE AGE >= 30 GROUP BY POS ORDER BY count DESC LIMIT 1;"}
+{"question": "What is the age of the oldest 25% of the players in the NBA", "sql": "SELECT CAST(AGE AS INTEGER) AS percentile FROM nba_roster ORDER BY percentile LIMIT 1 OFFSET (SELECT COUNT(*) FROM nba_roster) * 75/100 - 1;"}
+{"question": "What is the average age of players at each position in the NBA", "sql": "SELECT POS, AVG(AGE) AS avg_age FROM nba_roster GROUP BY POS;"}
+{"question": "What is the position with the highest average salary in the NBA", "sql": "SELECT POS, AVG(CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER)) AS avg_salary FROM nba_roster GROUP BY POS ORDER BY avg_salary DESC LIMIT 1;"}
+{"question": "What is the average age of the youngest players in the NBA", "sql": "SELECT AVG(AGE) as avg_age FROM nba_roster WHERE AGE <= 25;"}
+{"question": "What is the team with the highest average salary in the NBA", "sql": "SELECT TEAM, AVG(CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER)) as avg_salary FROM nba_roster WHERE SALARY!= '--' GROUP BY TEAM ORDER BY avg_salary DESC LIMIT 1;"}
+{"question": "Who are the top 5 most valuable players in the NBA, considering both their salary and jersey number", "sql": "SELECT name, (CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER) + CAST(Jersey AS INTEGER)) AS total_value, POS FROM nba_roster WHERE SALARY!= '--' ORDER BY total_value DESC LIMIT 5;"}
+{"question": "Which three teams in the NBA have the highest average salary", "sql": "SELECT team, AVG(CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER)) AS avg_salary FROM nba_roster WHERE SALARY!= '--' GROUP BY team ORDER BY avg_salary DESC LIMIT 3;"}
+{"question": "How many players in the NBA are more than 5 years older than the average age of all players", "sql": "SELECT COUNT(*) FROM nba_roster WHERE AGE - (SELECT AVG(AGE) FROM nba_roster) > 5;"}
+{"question": "What is the position with the oldest average age in the NBA", "sql": "SELECT POS, AVG(AGE) as avg_age FROM nba_roster GROUP BY POS ORDER BY avg_age DESC LIMIT 1;"}
+{"question": "Which 10 teams in the NBA have the oldest average age among their players", "sql": "SELECT Team, AVG(AGE) AS avg_age FROM nba_roster GROUP BY Team ORDER BY avg_age DESC LIMIT 10;"}
+{"question": "Who is the tallest player in the NBA", "sql": "SELECT NAME, AVG(CAST(SUBSTR(HT, 1, INSTR(HT,' ')-1) AS INTEGER)+ CAST(SUBSTR(HT, INSTR(HT,' ')+1) AS FLOAT)/12) as height FROM nba_roster GROUP BY NAME ORDER BY height DESC LIMIT 1;"}
+{"question": "Who are the top 5 highest-paid players in the NBA", "sql": "SELECT NAME, SALARY FROM nba_roster WHERE SALARY!= '--' ORDER BY CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER) DESC LIMIT 5;"}
+{"question": "How many players in the NBA are older than 10 years old", "sql": "SELECT COUNT(*) AS num_players FROM nba_roster WHERE AGE > 10;"}
+{"question": "What are the top 3 colleges with the highest average salaries for their NBA players", "sql": "SELECT COLLEGE, AVG(CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER)) as average_salary FROM nba_roster WHERE COLLEGE!= '--' GROUP BY COLLEGE ORDER BY average_salary DESC LIMIT 3;"}
+{"question": "What is the 75th percentile salary in the NBA", "sql": "SELECT (SELECT CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER) as percentile FROM nba_roster WHERE SALARY!= '--' ORDER BY percentile ASC LIMIT 1 OFFSET (SELECT COUNT(*) FROM nba_roster WHERE SALARY!= '--')*75/100-1) as seventy_fifth_percentile_salary;"}
+{"question": "What is the average age of players on each NBA team", "sql": "SELECT TEAM, AVG(AGE) as average_age FROM nba_roster GROUP BY TEAM ORDER BY average_age;"}
+{"question": "What is the average age of the players on the Toronto Raptors", "sql": "SELECT AVG(AGE) FROM nba_roster WHERE team='Toronto Raptors';"}
+{"question": "What is the age range of players on each team in the NBA", "sql": "SELECT team, MIN(AGE) as youngest_player, MAX(AGE) as oldest_player FROM nba_roster GROUP BY team;"}
+{"question": "What are the min and max salaries for each team", "sql": "SELECT MIN(CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER)) as min_salary, MAX(CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER)) as max_salary, team FROM nba_roster WHERE SALARY!= '--' GROUP BY team ORDER BY min_salary DESC, max_salary DESC;"}
+{"question": "What is the name of the player who attended the college with the longest name", "sql": "SELECT NAME, COLLEGE FROM nba_roster WHERE COLLEGE!= '--' ORDER BY LENGTH(COLLEGE) DESC LIMIT 1;"}
+{"question": "What is the number of players on each team in the NBA", "sql": "SELECT Team, COUNT(*) as num_players FROM nba_roster GROUP BY Team;"}
+{"question": "What is the most represented college in the NBA", "sql": "SELECT COLLEGE, COUNT(*) AS frequency FROM nba_roster WHERE COLLEGE!= '--' GROUP BY COLLEGE ORDER BY frequency DESC LIMIT 1;"}
+{"question": "How many Boston Celtics players did not attend college", "sql": "SELECT COUNT(*) as count FROM nba_roster WHERE team='Boston Celtics' AND COLLEGE='--';"}
+{"question": "What is the team with the highest average age in the NBA", "sql": "SELECT AVG(AGE) as average_age, TEAM FROM nba_roster GROUP BY TEAM ORDER BY average_age DESC LIMIT 1;"}
+{"question": "What is the average salary of all players in the NBA, excluding those with a salary of '--'", "sql": "SELECT AVG(CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER)) as average_salary FROM nba_roster WHERE SALARY!= '--';"}
+{"question": "What is the average salary for players of each age group in the NBA, excluding those with unknown salaries", "sql": "SELECT AVG(AGE) AS avg_age, AVG(CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER)) AS avg_salary FROM nba_roster WHERE SALARY!= '--' GROUP BY AGE ORDER BY avg_age;"}
+{"question": "Who is the player with the highest jersey number in the NBA", "sql": "SELECT NAME, JERSEY FROM nba_roster WHERE JERSEY!= 'NA' ORDER BY JERSEY DESC LIMIT 1;"}
+{"question": "What is the number of players on the Toronto Raptors", "sql": "SELECT COUNT(*) as num_players FROM nba_roster WHERE team='Toronto Raptors';"}
+{"question": "What is the average age of all NBA players with a known salary", "sql": "SELECT AVG(AGE) FROM nba_roster WHERE SALARY!= '--';"}
+{"question": "What is the position with the highest average age among players between the ages of 22 and 25", "sql": "SELECT AVG(AGE) AS avg_age, POS FROM nba_roster WHERE AGE BETWEEN 22 AND 25 GROUP BY POS ORDER BY avg_age DESC LIMIT 1;"}
+{"question": "What are the top 5 positions in the NBA with the highest average salary", "sql": "SELECT POS, AVG(CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER)) as avg_salary FROM nba_roster GROUP BY POS ORDER BY avg_salary DESC LIMIT 5;"}
+{"question": "What are the top 5 highest-paid players in the NBA", "sql": "SELECT * FROM nba_roster ORDER BY CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER) DESC LIMIT 5;"}
+{"question": "Which player has the highest average salary in the NBA", "sql": "SELECT name, AVG(CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER)) as average_salary FROM nba_roster WHERE SALARY!= '--' GROUP BY name ORDER BY average_salary DESC LIMIT 1;"}
+{"question": "Which team has the tallest players on average", "sql": "SELECT TEAM, AVG(CAST(SUBSTR(HT, 1, INSTR(HT,' ')-1) AS INTEGER)+ CAST(SUBSTR(HT, INSTR(HT,' ')+1) AS FLOAT)/12) as average_height FROM nba_roster GROUP BY TEAM ORDER BY average_height DESC LIMIT 1;"}
+{"question": "Who is the highest-paid player in the NBA who has attended a college with an unknown college affiliation", "sql": "SELECT NAME FROM nba_roster WHERE SALARY!= '--' AND COLLEGE = '--' ORDER BY CAST(SUBSTR(SALARY, 2) as INTEGER) DESC LIMIT 1;"}
+{"question": "What is the average age and salary for each position in the NBA", "sql": "SELECT POS, AVG(AGE) as avg_age, AVG(CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER)) as avg_salary FROM nba_roster GROUP BY POS;"}
+{"question": "What is the number of unique colleges represented in the NBA", "sql": "SELECT COUNT(DISTINCT COLLEGE) FROM nba_roster WHERE COLLEGE!= '--';"}
+{"question": "Which team has the oldest average age among all NBA teams", "sql": "SELECT team, AVG(AGE) AS average_age FROM nba_roster GROUP BY team ORDER BY average_age DESC LIMIT 1;"}
+{"question": "What is the highest-paid player on the Los Angeles Lakers", "sql": "SELECT salary, name FROM nba_roster WHERE team='Los Angeles Lakers' AND SALARY!= '--' ORDER BY CAST(REPLACE(REPLACE(SALARY, '$', ''), ',', '') AS INTEGER) DESC LIMIT 1;"}
+{"question": "Which NBA team has the most players from the University of Michigan", "sql": "SELECT team, COUNT(*) AS num_players FROM nba_roster WHERE COLLEGE='Michigan' GROUP BY team ORDER BY num_players DESC LIMIT 1;"}
+{"question": "What are the most common positions in the NBA", "sql": "SELECT POS, COUNT(*) as count FROM nba_roster GROUP BY POS ORDER BY count DESC;"}
+{"question": "What are the top 5 teams with the highest average salary in the NBA", "sql": "SELECT team, AVG(CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER)) AS average_salary FROM nba_roster GROUP BY team ORDER BY average_salary DESC LIMIT 5;"}
+{"question": "How many NBA players attended a college other than '--'", "sql": "SELECT COUNT(*) FROM nba_roster WHERE COLLEGE!= '--';"}
+{"question": "Who is the highest-paid player on the Memphis Grizzlies", "sql": "select name, team, salary from nba_roster where team='Memphis Grizzlies' and SALARY!='--' order by CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER) desc limit 1;"}
+{"question": "Which team has the highest average salary", "sql": "SELECT Team, AVG(CAST(SUBSTR(SALARY, 2, LENGTH(SALARY)-2) AS INTEGER)) as average_salary FROM nba_roster WHERE SALARY!= '--' GROUP BY Team ORDER BY average_salary DESC LIMIT 1;"}
+{"question": "What college has the highest average age of its alumni in the NBA", "sql": "SELECT NAME, AVG(AGE) as average_age FROM nba_roster WHERE COLLEGE!= '--' GROUP BY COLLEGE ORDER BY average_age DESC LIMIT 1;"}
+{"question": "Who is the highest-paid player in the NBA who has attended college", "sql": "SELECT NAME FROM nba_roster WHERE COLLEGE!= '--' AND SALARY!= '--' ORDER BY CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER) DESC LIMIT 1;"}
+{"question": "Who is the highest-paid player in the NBA who is older than 25 years old", "sql": "SELECT name, salary FROM nba_roster WHERE AGE > 25 AND SALARY!= '--' ORDER BY CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER) DESC LIMIT 1;"}
+{"question": "What is the average salary for each age group in the NBA", "sql": "SELECT AVG(CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER)) as average_salary, AGE as age_group FROM nba_roster GROUP BY AGE;"}
+{"question": "What is the most common age and position combination in the NBA", "sql": "SELECT AGE, POS, COUNT(*) AS count FROM nba_roster GROUP BY AGE, POS ORDER BY count DESC;"}
+{"question": "Who are the top 5 players with the highest jersey numbers in the NBA", "sql": "SELECT NAME, Jersey FROM nba_roster WHERE Jersey IN (SELECT Jersey FROM nba_roster ORDER BY CAST(CAST(Jersey AS INTEGER) AS INTEGER) DESC LIMIT 5);"}
+{"question": "What is the average height of players in the NBA who are 25 years old or younger", "sql": "SELECT AVG(CAST(SUBSTR(HT, 1, INSTR(HT,' ')-1) AS INTEGER)) AS avg_height FROM nba_roster WHERE AGE <= 25;"}
+{"question": "What are the top 5 highest-paid players in each position in the NBA", "sql": "WITH ranked_positions AS (SELECT *, DENSE_RANK() OVER (PARTITION BY POS ORDER BY CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER) DESC) AS rank FROM nba_roster) SELECT * FROM ranked_positions WHERE rank <= 5;"}
+{"question": "How many players in the NBA are older than 25 years old", "sql": "SELECT COUNT(*) as num_players FROM nba_roster WHERE AGE > 25;"}
+{"question": "What is the most common position for players under the age of 25 in the NBA", "sql": "SELECT POS, COUNT(*) AS count FROM nba_roster WHERE AGE <= 25 GROUP BY POS ORDER BY count DESC LIMIT 1;"}
+{"question": "Who is the player with the highest jersey number on the Golden State Warriors", "sql": "SELECT NAME FROM nba_roster WHERE TEAM = 'Golden State Warriors' AND CAST(Jersey AS INTEGER) = (SELECT MAX(CAST(Jersey AS INTEGER)) FROM nba_roster WHERE TEAM = 'Golden State Warriors');"}
+{"question": "Which five teams in the NBA have the largest rosters", "sql": "SELECT Team, COUNT(*) as num_players FROM nba_roster GROUP BY Team ORDER BY num_players DESC LIMIT 5;"}
+{"question": "What is the average salary for each position in the NBA, and which position has the highest average salary", "sql": "SELECT POS, AVG(CAST(SUBSTR(SALARY, 2) AS INTEGER)) as avg_salary FROM nba_roster WHERE SALARY!= '--' GROUP BY POS ORDER BY avg_salary DESC;"}
+{"question": "Which team has the highest average salary in the NBA", "sql": "SELECT team, AVG(CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER)) as average_salary FROM nba_roster WHERE SALARY!= '--' GROUP BY team ORDER BY average_salary DESC LIMIT 1;"}
+{"question": "Who is the oldest player in the NBA, on average, among those with known salaries", "sql": "SELECT NAME, AVG(AGE) as avg_age FROM nba_roster WHERE SALARY!= '--' GROUP BY NAME ORDER BY avg_age DESC LIMIT 1;"}
+{"question": "What is the total salary of all players in the NBA who are 25 years old or younger", "sql": "SELECT SUM(CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER)) as total_salary FROM nba_roster WHERE AGE <= 25;"}
+{"question": "Who is the second-highest paid player on the Memphis Grizzlies", "sql": "select name, team, salary from nba_roster where team='Memphis Grizzlies' and SALARY!= '--' ORDER BY CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER) DESC LIMIT 1 OFFSET 1;"}
+{"question": "Who are the top 3 highest-paid players in the NBA", "sql": "SELECT * FROM (SELECT *, ROW_NUMBER() OVER (ORDER BY CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER) DESC) as row_num FROM nba_roster WHERE SALARY!= '--') AS subquery WHERE row_num <= 3;"}
+{"question": "What is the average age of players for each team in the NBA", "sql": "SELECT team, AVG(AGE) AS avg_age FROM nba_roster GROUP BY team;"}
+{"question": "How many Boston Celtics players have a salary greater than $5,000,000", "sql": "SELECT COUNT(*) as count FROM nba_roster WHERE team='Boston Celtics' AND CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER) > 5000000;"}
+{"question": "What is the average age of the players in the NBA roster", "sql": "SELECT AVG(AGE) as average_age FROM nba_roster;"}
+{"question": "Who are the top 3 highest-paid players at each position in the NBA", "sql": "WITH ranked_positions AS (SELECT *, DENSE_RANK() OVER (PARTITION BY POS ORDER BY CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER) DESC) AS rank FROM nba_roster) SELECT * FROM ranked_positions WHERE rank <= 3;"}
+{"question": "Who is the oldest player on the Toronto Raptors", "sql": "SELECT name, age FROM nba_roster WHERE team='Toronto Raptors' ORDER BY age DESC LIMIT 1;"}
+{"question": "Which team has the oldest average age in the NBA", "sql": "SELECT Team, AVG(AGE) AS Average_Age FROM nba_roster GROUP BY Team ORDER BY Average_Age DESC LIMIT 1;"}
+{"question": "What are the positions with the most players under the age of 25", "sql": "SELECT pos, COUNT(*) as num_players FROM nba_roster WHERE age < 25 GROUP BY pos;"}
+{"question": "Who are the top 3 players in the NBA roster with the highest jersey numbers", "sql": "SELECT NAME, JERSEY FROM nba_roster ORDER BY JERSEY DESC LIMIT 3;"}
+{"question": "What is the average height of the youngest players in the NBA", "sql": "SELECT AVG(CAST(SUBSTR(HT, 1, INSTR(HT,' ')-1) AS INTEGER)+ CAST(SUBSTR(HT, INSTR(HT,' ')+1) AS FLOAT)/12) as height FROM nba_roster WHERE age <= 25;"}
+{"question": "Who is the oldest player in the NBA", "sql": "SELECT NAME FROM nba_roster WHERE AGE = (SELECT MAX(AGE) FROM nba_roster);"}
+{"question": "What are the top 5 teams with the highest average salaries in the NBA", "sql": "SELECT team, AVG(CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER)) as average_salary FROM nba_roster WHERE SALARY!= '--' GROUP BY team ORDER BY average_salary DESC LIMIT 5;"}
+{"question": "What is the highest-paid player on the same team as a Toronto Raptors player", "sql": "SELECT name, team, salary FROM nba_roster WHERE team IN (SELECT team FROM nba_roster WHERE name IN (SELECT name FROM nba_roster WHERE team='Toronto Raptors')) ORDER BY CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER) DESC LIMIT 1;"}
+{"question": "Which teams have the most young players in the NBA", "sql": "SELECT team, COUNT(*) as num_players FROM nba_roster WHERE AGE < 25 GROUP BY team order by num_players desc;"}
+{"question": "What is the position with the most players in the age range of 22-25 in the NBA", "sql": "SELECT POS, COUNT(*) AS count FROM nba_roster WHERE AGE BETWEEN 22 AND 25 GROUP BY POS ORDER BY count DESC LIMIT 1;"}
+{"question": "What is the number of players in the NBA who are older than the average age of all players in the league", "sql": "SELECT COUNT(*) as num_players FROM nba_roster WHERE AGE > (SELECT AVG(AGE) FROM nba_roster);"}
+{"question": "What is the most common position for young players in the NBA", "sql": "SELECT POS, COUNT(*) AS count FROM nba_roster WHERE AGE BETWEEN 22 AND 25 GROUP BY POS ORDER BY count DESC LIMIT 1;"}
+{"question": "Which three teams in the NBA have the largest rosters", "sql": "SELECT Team, COUNT(*) as num_players FROM nba_roster GROUP BY Team ORDER BY num_players DESC LIMIT 3;"}
+{"question": "What are the top 5 teams with the oldest average age of players", "sql": "SELECT Team, AVG(AGE) as average_age FROM nba_roster GROUP BY Team ORDER BY average_age DESC LIMIT 5;"}
+{"question": "What age group has the most players in the NBA", "sql": "SELECT AGE, COUNT(*) as count FROM nba_roster GROUP BY AGE ORDER BY count DESC;"}
+{"question": "What is the average age of players in each position in the NBA", "sql": "SELECT AVG(AGE) AS avg_age, POS FROM nba_roster GROUP BY POS ORDER BY avg_age;"}
+{"question": "What are the top 3 highest-paid players from Duke University", "sql": "SELECT name, salary FROM nba_roster WHERE COLLEGE = 'Duke' ORDER BY CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER) DESC LIMIT 3;"}
+{"question": "Which team has the most non-point guards", "sql": "SELECT name, team FROM nba_roster WHERE team IN (SELECT team FROM nba_roster WHERE POS='PG' GROUP BY team HAVING COUNT(*) > 5 ORDER BY COUNT(*) DESC LIMIT 1) AND POS!= 'PG';"}
+{"question": "Who is the player with the highest jersey number on the Boston Celtics", "sql": "SELECT NAME FROM nba_roster WHERE team='Boston Celtics' AND CAST(Jersey AS INTEGER) = (SELECT MAX(CAST(Jersey AS INTEGER)) FROM nba_roster WHERE team='Boston Celtics');"}
+{"question": "Which teams have the most players aged 25 or older", "sql": "SELECT team, COUNT(*) as num_players FROM nba_roster WHERE age >= 25 GROUP BY team;"}
+{"question": "How many players in the NBA are older than 20 years old", "sql": "SELECT COUNT(*) AS num_players FROM nba_roster WHERE age + (2022 - 2000) > 10;"}
+{"question": "What is the average age and height of Power Forward players in the NBA", "sql": "SELECT AVG(CAST(SUBSTR(AGE, 1, INSTR(AGE,' ') - 1) AS INTEGER)) as average_age, AVG(CAST(SUBSTR(AGE, INSTR(AGE,' ') + 1) AS FLOAT)) as average_height FROM nba_roster WHERE POS = 'PF';"}
+{"question": "Which team has the most players under the age of 36", "sql": "SELECT team, COUNT(*) FROM nba_roster WHERE AGE < 3*12 GROUP BY team ORDER BY COUNT(*) DESC LIMIT 1;"}
+{"question": "What is the number of players under the age of 25 with known heights for each position in the NBA", "sql": "SELECT pos, COUNT(*) as num_players FROM nba_roster WHERE AGE < 25 AND HT!= 'NA' GROUP BY pos;"}
+{"question": "What is the average salary of NBA players 25 years old or younger", "sql": "SELECT AVG(CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER)) as average_salary FROM nba_roster WHERE AGE <= 25 AND SALARY!= '--';"}
+{"question": "What is the most popular position in the NBA", "sql": "SELECT POS, COUNT(*) as count FROM nba_roster WHERE POS!= 'NA' GROUP BY POS ORDER BY count DESC LIMIT 1;"}
+{"question": "What position has the most players earning a salary above the average salary in the NBA", "sql": "SELECT POS, COUNT(*) AS count FROM nba_roster WHERE CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER) > (SELECT AVG(CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER)) FROM nba_roster) GROUP BY POS ORDER BY count DESC LIMIT 1;"}
+{"question": "Which three teams in the NBA have the highest average salaries", "sql": "SELECT team, AVG(CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER)) as average_salary FROM nba_roster WHERE SALARY!= '--' GROUP BY team ORDER BY average_salary DESC LIMIT 3;"}
+{"question": "Which five colleges have produced the most players in the NBA", "sql": "SELECT COLLEGE, COUNT(*) as count FROM nba_roster WHERE COLLEGE!= '--' GROUP BY COLLEGE ORDER BY count DESC LIMIT 5;"}
+{"question": "Which teams have the most players who are at least 5 years older than the youngest player in the league", "sql": "SELECT team, COUNT(*) AS num_players FROM nba_roster WHERE age - (SELECT MIN(age) FROM nba_roster) > 5 GROUP BY team ORDER BY num_players DESC;"}
+{"question": "Who are the Boston Celtics players aged 25 or older, listed in order of their jersey number", "sql": "SELECT name FROM nba_roster WHERE team='Boston Celtics' AND age>=25 ORDER BY CAST(Jersey AS INTEGER) ASC;"}
+{"question": "What is the average salary of all NBA players who are 25 years or older", "sql": "SELECT AVG(CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER)) AS average_salary FROM nba_roster WHERE AGE >= 25;"}
+{"question": "What is the highest-paid player on the Cleveland Cavaliers", "sql": "SELECT salary, name FROM nba_roster WHERE team='Cleveland Cavaliers' AND SALARY!= '--' ORDER BY CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER) DESC LIMIT 1;"}
+{"question": "What is the highest-paid player on the Toronto Raptors", "sql": "SELECT name, salary FROM nba_roster WHERE team='Toronto Raptors' AND salary!= '--' ORDER BY CAST(REPLACE(REPLACE(salary, '$', ''), ',','') AS INTEGER) DESC LIMIT 1;"}
+{"question": "What is the highest salary for each position in the NBA", "sql": "SELECT POS, MAX(CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER)) AS max_salary FROM nba_roster GROUP BY POS;"}
+{"question": "What is the average salary of all NBA players, excluding those with unknown salaries", "sql": "SELECT AVG(CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER)) AS average_salary FROM nba_roster WHERE SALARY!= '--';"}
+{"question": "Who is the youngest player on the Toronto Raptors", "sql": "SELECT NAME FROM nba_roster WHERE AGE = (SELECT MIN(AGE) FROM nba_roster WHERE TEAM = 'Toronto Raptors');"}
+{"question": "What is the height of the 75th percentile of NBA players", "sql": "SELECT CAST(SUBSTR(HT, 1, INSTR(HT,' ')-1) AS INTEGER)+ CAST(SUBSTR(HT, INSTR(HT,' ')+1) AS FLOAT)/12 as percentile from nba_roster order by percentile limit 1 offset (SELECT COUNT(*) FROM nba_roster)*0.75;"}
+{"question": "Who are the top 5 players in the NBA with the highest total value, considering both their salary and jersey number", "sql": "SELECT name, (CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER) + CAST(Jersey AS INTEGER)) as total_value, POS FROM nba_roster WHERE SALARY!= '--' AND Jersey!= 'NA' ORDER BY total_value DESC LIMIT 5;"}
+{"question": "Which colleges have more than one player in the NBA", "sql": "SELECT COLLEGE, COUNT(*) AS num_players FROM nba_roster GROUP BY COLLEGE HAVING COUNT(*) > 1;"}
+{"question": "Who is the highest-paid guard on the Los Angeles Lakers", "sql": "SELECT name FROM nba_roster WHERE team='Los Angeles Lakers' AND POS='G' AND SALARY!='--' ORDER BY CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER) DESC LIMIT 1;"}
+{"question": "Which players in the NBA are taller than 6'7", "sql": "SELECT name FROM nba_roster WHERE CAST(SUBSTR(HT, 1, INSTR(HT,' ')-1) AS INTEGER)+ CAST(SUBSTR(HT, INSTR(HT,' ')+1) AS FLOAT)/12 >= 6.67;"}
+{"question": "What is the average height of all players in the NBA", "sql": "SELECT AVG(CAST(SUBSTR(HT, 1, INSTR(HT,' ')-1) AS INTEGER)+ CAST(SUBSTR(HT, INSTR(HT,' ')+1) AS FLOAT)/12) as average_height from nba_roster;"}
+{"question": "How many players on the Boston Celtics did not attend college", "sql": "SELECT COUNT(*) FROM nba_roster WHERE team='Boston Celtics' AND COLLEGE='--';"}
+{"question": "What is the team with the most players 30 or older in the NBA", "sql": "SELECT TEAM, COUNT(*) as num_players FROM nba_roster WHERE AGE >= 30 GROUP BY TEAM ORDER BY num_players DESC LIMIT 1;"}
+{"question": "What are the top 10 most common positions in the NBA", "sql": "SELECT POS, COUNT(*) AS count FROM nba_roster GROUP BY POS ORDER BY count DESC LIMIT 10;"}
+{"question": "What is the number of players on each team who earn more than $1,000,000 and the total number of players on each team", "sql": "SELECT team, COUNT(*) as num_players, SUM(CASE WHEN CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER) > 1000000 THEN 1 ELSE 0 END) as num_players_above_1m FROM nba_roster WHERE SALARY!= '--' GROUP BY team;"}
+{"question": "Who is the player with the highest average salary in the NBA", "sql": "SELECT name, AVG(CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER)) as average_salary FROM nba_roster WHERE SALARY!= '--' GROUP BY name ORDER BY average_salary DESC LIMIT 1;"}
+{"question": "Who are the top 5 players in the NBA in terms of their total value, combining their salary and jersey number", "sql": "SELECT name, (CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER) + CAST(Jersey AS INTEGER)) as total_value, POS FROM nba_roster WHERE SALARY!= '--' ORDER BY total_value DESC LIMIT 5;"}
+{"question": "How many players are on the Toronto Raptors", "sql": "SELECT COUNT(*) FROM nba_roster WHERE team='Toronto Raptors';"}
+{"question": "Which team has the most players over the age of 30", "sql": "SELECT Team, COUNT(*) as count FROM nba_roster WHERE CAST(AGE as INTEGER) > 30 GROUP BY Team ORDER BY count DESC LIMIT 1;"}
+{"question": "What is the average height of point guards in the NBA", "sql": "SELECT AVG(CAST(SUBSTR(HT, 1, INSTR(HT,' ')-1) AS INTEGER)) as average_height FROM nba_roster WHERE POS='PG';"}
+{"question": "What is the average salary of players in the NBA who are more than 5 years older than the average age of all players", "sql": "SELECT AVG(CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER)) as average_salary FROM nba_roster WHERE AGE - (SELECT AVG(AGE) FROM nba_roster) > 5 AND SALARY!= '--';"}
+{"question": "Which five teams in the NBA have the most players on their roster", "sql": "SELECT Team, COUNT(*) as num_players FROM nba_roster GROUP BY Team ORDER BY num_players DESC LIMIT 5;"}
+{"question": "What college has produced the most NBA players", "sql": "SELECT COLLEGE, COUNT(*) as frequency FROM nba_roster GROUP BY COLLEGE ORDER BY frequency DESC LIMIT 1;"}
+{"question": "What is the number of players in the NBA who are 25 years old or younger", "sql": "SELECT COUNT(*) AS num_players FROM nba_roster WHERE AGE <= 25;"}
+{"question": "What are the top 5 players who have played the most seasons in each position in the NBA", "sql": "SELECT pos, name, COUNT(*) as seasons_played FROM nba_roster WHERE SALARY!= '--' GROUP BY pos, name ORDER BY seasons_played DESC LIMIT 5;"}
+{"question": "What are the average salaries for each position in the NBA, and which positions have the highest average salaries", "sql": "SELECT POS, AVG(CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER)) AS avg_salary FROM nba_roster GROUP BY POS ORDER BY avg_salary DESC;"}
+{"question": "Which team has the most players who are significantly older than the average age of all NBA players", "sql": "SELECT team, COUNT(*) as num_players FROM nba_roster WHERE AGE - (SELECT AVG(AGE) FROM nba_roster) > 5 GROUP BY team ORDER BY num_players DESC LIMIT 1;"}
+{"question": "What is the most common height range among NBA players under the age of 25", "sql": "SELECT HT, COUNT(*) as count FROM nba_roster WHERE AGE <= 25 GROUP BY HT ORDER BY count DESC LIMIT 1;"}
+{"question": "What is the breakdown of players by position in the NBA", "sql": "SELECT POS, COUNT(*) as count FROM nba_roster GROUP BY POS;"}
+{"question": "What are the top 3 teams in the NBA with the highest average salary", "sql": "SELECT team, AVG(CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER)) as average_salary FROM nba_roster GROUP BY team ORDER BY average_salary DESC LIMIT 3;"}
+{"question": "What is the most common age range and position combination among NBA players", "sql": "SELECT age_range, POS, COUNT(*) AS count FROM (SELECT CASE WHEN AGE <= 25 THEN 'Young' WHEN AGE <= 30 THEN 'Established' ELSE 'Veteran' END AS age_range, POS FROM nba_roster) AS subquery GROUP BY age_range, POS ORDER BY count DESC LIMIT 1;"}
+{"question": "What is the team with the tallest average height in the NBA", "sql": "SELECT team, AVG(CAST(SUBSTR(HT, 1, INSTR(HT,' ')-1) AS INTEGER)+ CAST(SUBSTR(HT, INSTR(HT,' ')+1) AS FLOAT)/12) as height from nba_roster group by team order by height desc limit 1;"}
+{"question": "What is the number of the player with the highest jersey number in the NBA", "sql": "SELECT NAME, JERSEY FROM nba_roster ORDER BY CAST(JERSEY AS INTEGER) DESC LIMIT 1;"}
+{"question": "How many players in the NBA are older than the sum of their jersey number and age", "sql": "SELECT COUNT(*) as num_players FROM nba_roster WHERE age + CAST(SUBSTR(Jersey, 1, INSTR(Jersey,' ')-1) AS INTEGER) > 5;"}
+{"question": "How many players in the NBA are under the age of 25", "sql": "SELECT COUNT(*) AS under_25 FROM nba_roster WHERE AGE < 25;"}
+{"question": "What are the top 5 teams in the NBA with the highest average salary", "sql": "SELECT Team, AVG(CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER)) AS avg_salary FROM nba_roster GROUP BY Team ORDER BY avg_salary DESC LIMIT 5;"}
+{"question": "What is the average age of players who attended the same college as Otto Porter Jr.", "sql": "SELECT COLLEGE, AVG(AGE) AS avg_age FROM nba_roster WHERE COLLEGE IN (SELECT COLLEGE FROM nba_roster WHERE NAME = 'Otto Porter Jr.') GROUP BY COLLEGE;"}
+{"question": "How many players in the NBA are at least 5 years older than the youngest player in the league", "sql": "SELECT COUNT(*) as num_players FROM nba_roster WHERE AGE - (SELECT MIN(AGE) FROM nba_roster) > 5;"}
+{"question": "How many players in the NBA are more than 5 years older than the average age of all players in the league", "sql": "SELECT COUNT(*) as num_players FROM nba_roster WHERE AGE - (SELECT AVG(AGE) FROM nba_roster) > 5;"}
+{"question": "What is the average salary of the Toronto Raptors players", "sql": "SELECT AVG(CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER)) AS average_salary FROM nba_roster WHERE team='Toronto Raptors' AND SALARY!= '--';"}
+{"question": "Who is the highest-paid player on the Los Angeles Lakers who attended college", "sql": "SELECT NAME FROM nba_roster WHERE TEAM = 'Los Angeles Lakers' AND COLLEGE!= '--' AND SALARY!= '--' ORDER BY CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER) DESC LIMIT 1;"}
+{"question": "What is the position with the most players in the NBA", "sql": "SELECT POS, COUNT(*) AS count FROM nba_roster GROUP BY POS ORDER BY count DESC LIMIT 1;"}
+{"question": "Who is the highest-paid player in the NBA, excluding those with unknown salaries", "sql": "SELECT NAME, SALARY FROM nba_roster WHERE SALARY = (SELECT MAX(SALARY) FROM nba_roster WHERE SALARY!= '--');"}
+{"question": "What is the name and jersey number of the player with the highest jersey number in the NBA", "sql": "SELECT NAME, JERSEY FROM nba_roster WHERE JERSEY!= 'NA' ORDER BY CAST(JERSEY AS INTEGER) DESC LIMIT 1;"}
+{"question": "What is the average age of the players in the NBA who are at least 6 feet 7 inches tall", "sql": "SELECT AVG(AGE) AS average_age FROM nba_roster WHERE CAST(SUBSTR(HT, 1, INSTR(HT,' ')-1) AS INTEGER)+ CAST(SUBSTR(HT, INSTR(HT,' ')+1) AS FLOAT)/12 >= 6.67;"}
+{"question": "What is the average age of players in the NBA who have a total of 12 years of experience or less", "sql": "SELECT AVG(AGE) FROM nba_roster WHERE AGE * 12 * 5 <= (SELECT SUM(AGE * 12) FROM nba_roster);"}
+{"question": "Which team has the most players from the University of Michigan", "sql": "SELECT team, COUNT(*) as count FROM nba_roster WHERE COLLEGE='Michigan' GROUP BY team ORDER BY count DESC LIMIT 1;"}
+{"question": "Who is the tallest Power Forward in the NBA", "sql": "SELECT POS, NAME, MAX(CAST(SUBSTR(HT, 1, INSTR(HT,' ')-1) AS INTEGER)+ CAST(SUBSTR(HT, INSTR(HT,' ')+1) AS FLOAT)/12) AS max_height FROM nba_roster WHERE POS='PF';"}
+{"question": "What is the average age for each position in the NBA", "sql": "SELECT pos, AVG(AGE) AS avg_age FROM nba_roster WHERE POS IN ('PG', 'SG', 'SF', 'PF', 'C') GROUP BY pos;"}
+{"question": "How many players are currently on the Toronto Raptors", "sql": "SELECT COUNT(*) AS num_players FROM nba_roster WHERE team='Toronto Raptors';"}
+{"question": "Which teams in the NBA have the oldest average age among their players", "sql": "SELECT TEAM, AVG(AGE) as avg_age FROM nba_roster WHERE SALARY!= '--' GROUP BY TEAM ORDER BY avg_age DESC;"}
+{"question": "What is the distribution of players across different positions in the NBA", "sql": "SELECT POS, COUNT(*) as count FROM nba_roster GROUP BY POS;"}
+{"question": "How many players in the NBA have been in the league for more than 10 years longer than the average age of all players", "sql": "SELECT COUNT(*) as long_tenured_players FROM nba_roster WHERE AGE > (SELECT AVG(AGE) FROM nba_roster) + 10;"}
+{"question": "What is the average height of all NBA players", "sql": "SELECT AVG(CAST(SUBSTR(HT, 1, INSTR(HT,' ')-1) as INTEGER)) FROM nba_roster;"}
+{"question": "Who is the oldest player from the University of Michigan to have played in the NBA", "sql": "SELECT NAME, MAX(AGE) as oldest FROM nba_roster WHERE COLLEGE='Michigan';"}
+{"question": "What are the most common colleges represented in the NBA, excluding players who did not attend college or did not disclose their college information", "sql": "SELECT COLLEGE, COUNT(*) as num_players FROM nba_roster WHERE COLLEGE!= '--' GROUP BY COLLEGE;"}
+{"question": "What is the average age of all NBA players who are older than 5 years old", "sql": "SELECT AVG(AGE) as average_age FROM nba_roster WHERE AGE > 5;"}
+{"question": "Who are the top 5 oldest Point Guards in the NBA", "sql": "SELECT * FROM nba_roster WHERE POS='PG' AND AGE > 25 ORDER BY AGE DESC LIMIT 5;"}
+{"question": "How many players in the NBA are older than 5 years old", "sql": "SELECT COUNT(*) FROM nba_roster WHERE age > 5;"}
+{"question": "How many players in the NBA have had a longer career than the average player and attended a college other than '--'", "sql": "SELECT COUNT(*) AS num_players FROM nba_roster WHERE AGE - (SELECT AVG(AGE) FROM nba_roster) > 5 AND COLLEGE!= '--';"}
+{"question": "What are the top 5 colleges that have produced the most NBA players", "sql": "SELECT COLLEGE, COUNT(*) as num_players FROM nba_roster WHERE COLLEGE!= '--' GROUP BY COLLEGE ORDER BY num_players DESC LIMIT 5;"}
+{"question": "Who is the highest-paid player on the Boston Celtics who plays either Small Forward or Power Forward", "sql": "SELECT name, salary FROM nba_roster WHERE team='Boston Celtics' AND (POS='SF' OR POS='PF' OR POS='SF/PF') AND SALARY!= '--' ORDER BY CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER) DESC LIMIT 1;"}
+{"question": "What is the number of players in the NBA who attended a college other than '--'?", "sql": "SELECT COUNT(*) AS num_players FROM nba_roster WHERE COLLEGE!= '--';"}
+{"question": "How many young players in the NBA are earning a salary", "sql": "SELECT COUNT(*) as young_players FROM nba_roster WHERE AGE <= 25 AND SALARY!= '--';"}
+{"question": "Who are the top 3 players with the highest total value in the NBA", "sql": "SELECT name, team, (CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER) + CAST(Jersey AS INTEGER)) AS total_value FROM nba_roster WHERE SALARY!= '--' AND Jersey!= 'NA' ORDER BY total_value DESC LIMIT 3;"}
+{"question": "What is the average age of players on each team in the NBA, excluding those with unknown salaries", "sql": "SELECT team, AVG(AGE) AS average_age FROM nba_roster WHERE SALARY!= '--' GROUP BY team ORDER BY average_age ASC;"}
+{"question": "Which team has the most players who attended college", "sql": "SELECT TEAM, COUNT(*) as num_players FROM nba_roster WHERE COLLEGE!= '--' GROUP BY TEAM ORDER BY num_players DESC LIMIT 1;"}
+{"question": "What is the average age and maximum salary for each position in the NBA", "sql": "SELECT pos, AVG(AGE) as avg_age, MAX(CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER)) as max_salary FROM nba_roster WHERE SALARY!= '--' GROUP BY POS;"}
+{"question": "How many players in the NBA are 25 years old or younger", "sql": "SELECT COUNT(*) FROM nba_roster WHERE AGE <= 25;"}
+{"question": "What are the top 5 players in the NBA in terms of salary-to-age ratio", "sql": "SELECT NAME, CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER) as salary, AGE FROM nba_roster WHERE SALARY!= '--' ORDER BY (CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER)/AGE) DESC LIMIT 5;"}
+{"question": "What are the top 3 players with the highest jersey numbers who are not Point Guards", "sql": "SELECT jersey, name FROM nba_roster WHERE pos!= 'PG' ORDER BY CAST(Jersey AS INTEGER) DESC LIMIT 3;"}
+{"question": "What is the most common college attended by NBA players", "sql": "SELECT COLLEGE, COUNT(*) AS frequency FROM nba_roster GROUP BY COLLEGE ORDER BY frequency DESC LIMIT 1;"}
+{"question": "Who is the tallest player in the league who plays either point guard, shooting guard, or small forward", "sql": "SELECT NAME, HT FROM nba_roster WHERE POS IN ('PG', 'SG', 'SF') ORDER BY HT DESC LIMIT 1;"}
+{"question": "What is the average salary of NBA players who attended the University of Michigan", "sql": "SELECT COLLEGE, AVG(CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER)) as average_salary FROM nba_roster WHERE COLLEGE='Michigan' GROUP BY COLLEGE;"}
+{"question": "What is the tallest team in the NBA", "sql": "SELECT team, AVG(CAST(SUBSTR(HT, 1, INSTR(HT,'')) AS INTEGER)) AS average_height FROM nba_roster GROUP BY team ORDER BY average_height DESC LIMIT 1;"}
+{"question": "What is the average height and age of NBA players", "sql": "SELECT AVG(CAST(SUBSTR(HT, 1, INSTR(HT,' ')-1) AS INTEGER) + CAST(SUBSTR(HT, INSTR(HT,' ')+1) AS FLOAT)/12) as height, AVG(CAST(AGE AS INTEGER)) as age FROM nba_roster;"}
+{"question": "What positions have more than 5 years of experience compared to the average age of all players in the NBA", "sql": "SELECT POS, COUNT(*) as num_players FROM nba_roster WHERE AGE - (SELECT AVG(AGE) FROM nba_roster) > 5 GROUP BY POS;"}
+{"question": "What is the second-highest paid player in the NBA", "sql": "SELECT name FROM nba_roster WHERE SALARY!= '--' ORDER BY CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER) DESC LIMIT 1 OFFSET (SELECT COUNT(*) FROM nba_roster WHERE SALARY!= '--') - 1;"}
+{"question": "What is the average age of the youngest power forward in the NBA", "sql": "SELECT AVG(AGE) AS avg_age, CAST(SUBSTR(HT, 1, INSTR(HT,' ')-1) AS INTEGER)+ CAST(SUBSTR(HT, INSTR(HT,' ')+1) AS FLOAT)/12 AS height FROM nba_roster WHERE POS='PF' GROUP BY height ORDER BY avg_age ASC LIMIT 1;"}
+{"question": "What are the top 5 highest-paid players for each position in the NBA", "sql": "WITH ranked_positions AS (SELECT *, DENSE_RANK() OVER (PARTITION BY POS ORDER BY CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER) DESC) as rank FROM nba_roster WHERE SALARY!= '--') SELECT * FROM ranked_positions WHERE rank <= 5;"}
+{"question": "What is the tallest player on each team in the NBA", "sql": "SELECT team, MAX(HT), name as max_height FROM nba_roster WHERE HT!= 'NA' GROUP BY team;"}
+{"question": "What is the position with the oldest players in the NBA", "sql": "SELECT POS, AVG(AGE) AS avg_age FROM nba_roster GROUP BY POS ORDER BY avg_age DESC LIMIT 1;"}
+{"question": "What is the average height of the players in the Boston Celtics", "sql": "SELECT AVG(CAST(SUBSTR(HT, 1, INSTR(HT,' ')-1) AS INTEGER) + CAST(SUBSTR(HT, INSTR(HT,' ')+1) AS FLOAT)/12) AS average_height FROM nba_roster WHERE team='Boston Celtics';"}
+{"question": "How many Los Angeles Lakers players attended college", "sql": "SELECT COUNT(*) AS num_players FROM nba_roster WHERE team='Los Angeles Lakers' AND COLLEGE!='--';"}
+{"question": "What is the average salary of players on the Toronto Raptors who are 25 years or older", "sql": "SELECT AVG(CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER)) AS average_salary FROM nba_roster WHERE team='Toronto Raptors' AND age>=25 AND SALARY!= '--';"}
+{"question": "Which teams have the most players under the age of 25", "sql": "SELECT team, COUNT(*) as num_players FROM nba_roster WHERE AGE < 25 GROUP BY team ORDER BY num_players DESC;"}
+{"question": "What are the average height and average salary for each team in the NBA", "sql": "SELECT team, AVG(CAST(SUBSTR(HT, 1, INSTR(HT,' ')-1) AS INTEGER)+ CAST(SUBSTR(HT, INSTR(HT,' ')+1) AS FLOAT)/12) as avg_height, AVG(CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER)) as avg_salary FROM nba_roster GROUP BY team;"}
+{"question": "Who are the top 5 players in the NBA with the highest jersey numbers", "sql": "SELECT NAME, JERSEY FROM nba_roster WHERE JERSEY!= 'NA' ORDER BY CAST(JERSEY AS INTEGER) DESC LIMIT 5;"}
+{"question": "Who is the highest-paid player from the University of Michigan in the NBA", "sql": "select name, salary from nba_roster where college='Michigan' and SALARY!= '--' ORDER BY CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER) DESC LIMIT 1;"}
+{"question": "What is the average age and height of players for each team in the NBA", "sql": "SELECT AVG(AGE) as avg_age, AVG(CAST(SUBSTR(HT, 1, INSTR(HT,' ')-1) AS INTEGER)+ CAST(SUBSTR(HT, INSTR(HT,' ')+1) AS FLOAT)/12) as avg_height, TEAM FROM nba_roster GROUP BY TEAM;"}
+{"question": "What is the average age and height of players on teams with more than 5 players in the NBA", "sql": "SELECT TEAM, AVG(AGE) as avg_age, AVG(CAST(SUBSTR(HT, 1, INSTR(HT,' ')-1) AS INTEGER)+ CAST(SUBSTR(HT, INSTR(HT,' ')+1) AS FLOAT)/12) as avg_height FROM nba_roster GROUP BY TEAM HAVING COUNT(*) > 5;"}
+{"question": "What is the average height of NBA players by age group", "sql": "SELECT AGE, AVG(CAST(SUBSTR(HT, 1, INSTR(HT,' ')-1) AS INTEGER)+ CAST(SUBSTR(HT, INSTR(HT,' ')+1) AS FLOAT)/12) AS avg_height FROM nba_roster GROUP BY AGE;"}
+{"question": "What is the average age of NBA players who play as Point Guard or Shooting Guard", "sql": "SELECT AVG(AGE) as average_age FROM nba_roster WHERE POS = 'PG' OR POS = 'SG' OR POS = 'PG/SG' OR POS = 'SG/PG';"}
+{"question": "Who are the top 5 highest-paid players in the NBA", "sql": "SELECT POS, NAME, CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER) as salary FROM nba_roster ORDER BY salary DESC LIMIT 5 OFFSET 0;"}
+{"question": "Who is the highest-paid player on the Boston Celtics who did not attend college", "sql": "SELECT NAME FROM nba_roster WHERE team='Boston Celtics' AND COLLEGE!='--' AND SALARY=(SELECT MAX(SALARY) FROM nba_roster WHERE team='Boston Celtics' AND COLLEGE!='--');"}
+{"question": "What is the average age of players in the NBA who are taller than 6 feet 7 inches", "sql": "SELECT AVG(AGE) FROM nba_roster WHERE CAST(SUBSTR(HT, 1, INSTR(HT,' ')-1) AS INTEGER) + CAST(SUBSTR(HT, INSTR(HT,' ')+1) AS FLOAT)/12 > 6.67;"}
+{"question": "What is the 99th percentile salary in the NBA?", "sql": "SELECT (CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER)) as percentile FROM nba_roster WHERE SALARY!= '--' order by percentile limit 1 offset (select count(*) from nba_roster where SALARY != '--')*99/100-1;"}
+{"question": "What is the 75th percentile salary in the NBA?", "sql": "SELECT (CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER)) as percentile FROM nba_roster WHERE SALARY!= '--' order by percentile limit 1 offset (select count(*) from nba_roster where SALARY != '--')*75/100-1;"}
+{"question": "What is the 25th percentile salary in the NBA?", "sql": "SELECT (CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER)) as percentile FROM nba_roster WHERE SALARY!= '--' order by percentile limit 1 offset (select count(*) from nba_roster where SALARY != '--')*25/100-1;"}
+{"question": "What is the median weight in the NBA?", "sql": "select CAST(SUBSTR(WT, 1, INSTR(WT,' ')) as INTEGER) as percentile from nba_roster order by percentile limit 1 offset (select count(*) from nba_roster)/2;"}
+{"question": "What is the average weight in the NBA?", "sql": "SELECT AVG(CAST(SUBSTR(WT, 1, INSTR(WT,' ')) as INTEGER)) FROM nba_roster;"}
+{"question": "What is the median height in the NBA?", "sql": "select CAST(SUBSTR(HT, 1, INSTR(HT,' ')-1) AS INTEGER)+ CAST(SUBSTR(HT, INSTR(HT,' ')+1) AS FLOAT)/12 as percentile from nba_roster order by percentile limit 1 offset (select count(*) from nba_roster)/2;"}
+{"question": "What is the average height in the NBA?", "sql": "select AVG(CAST(SUBSTR(HT, 1, INSTR(HT,' ')-1) AS INTEGER)+ CAST(SUBSTR(HT, INSTR(HT,' ')+1) AS FLOAT)/12) as height from nba_roster;"}
+{"question": "Can you tell me how many players are in the NBA?", "sql": "select count(*) from nba_roster;"}
+{"question": "Would you please let me know what the highest paid players are for each position?", "sql": "SELECT name, pos, MAX(CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER)) as max_salary FROM nba_roster WHERE SALARY!= '--' GROUP BY POS;"}
+{"question": "Is Jalen Johnson 23 years old?", "sql": "Select name, age from nba_roster where name='Jalen Johnson';"}
+{"question": "Who is the oldest player on the Brooklyn Nets?", "sql": "SELECT NAME FROM nba_roster WHERE TEAM = 'Brooklyn Nets' AND AGE = (SELECT MAX(AGE) FROM nba_roster WHERE TEAM = 'Brooklyn Nets');"}
+{"question": "Who has the highest salary on the Memphis Grizzlies?", "sql": "select salary, name from nba_roster where team='Memphis Grizzlies' and SALARY!= '--' ORDER BY CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER) DESC LIMIT 1;"}
+{"question": "Which player has the highest salary on the Cleveland Cavaliers?", "sql": "select salary, name from nba_roster where team='Cleveland Cavaliers' and SALARY!= '--' ORDER BY CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER) DESC LIMIT 1;"}
+{"question": "Who is the highest paid center on the Dallas Mavericks?", "sql": "select salary, name from nba_roster where team='Dallas Mavericks' and POS='C' and SALARY!= '--' ORDER BY CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER) DESC LIMIT 1;"}
+{"question": "How much is Marcus Smart getting paid?", "sql": "select salary from nba_roster where name='Marcus Smart';"}
+{"question": "What's the average age of the Trail Blazers?", "sql": "select avg(age) from nba_roster where team='Portland Trail Blazers';"}
+{"question": "What's the median age of the NBA?", "sql": "select CAST(AGE as INTEGER) as percentile from nba_roster order by percentile limit 1 offset (select count(*) from nba_roster)/2;"}
+{"question": "What's the median age of the Miami Heat?", "sql": "select CAST(AGE as INTEGER) as percentile from nba_roster where team='Miami Heat' order by percentile limit 1 offset (select count(*) from nba_roster where team='Miami Heat')/2;"}
+{"question": "What are the 5 teams with the oldest average age in the NBA", "sql": "SELECT team, AVG(AGE) AS average_age FROM nba_roster GROUP BY team ORDER BY average_age DESC LIMIT 5;"}
+{"question": "What is the average salary of Power Forward players in the NBA", "sql": "select avg(CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER)) as average_salary from nba_roster where POS = 'PF';"}

+ 128 - 0
recipes/3p_integrations/lamini/text2sql_memory_tuning/data/training_data/archive/generated_queries_v2_large_filtered_cleaned.jsonl

@@ -0,0 +1,128 @@
+{"question": "How many players are on each team in the NBA", "sql": "SELECT team, COUNT(*) as num_players FROM nba_roster GROUP BY team;"}
+{"question": "Who is the tallest player in the NBA roster", "sql": "SELECT name, AVG(CAST(SUBSTR(HT, 1, INSTR(HT,' ')-1) AS INTEGER)+ CAST(SUBSTR(HT, INSTR(HT,' ')+1) AS FLOAT)/12) as height FROM nba_roster GROUP BY name ORDER BY height DESC LIMIT 1;"}
+{"question": "What is the average age of NBA players", "sql": "SELECT AVG(AGE) FROM nba_roster;"}
+{"question": "Who is the heaviest player in the NBA", "sql": "SELECT NAME, WT FROM nba_roster WHERE WT!= 'NA' ORDER BY CAST(SUBSTR(WT, 1, INSTR(WT,' ')-1) AS INTEGER) DESC LIMIT 1;"}
+{"question": "What is the total salary of all players in the NBA who are at least 6 feet 7 inches tall", "sql": "SELECT SUM(CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER)) as total_salary FROM nba_roster WHERE CAST(SUBSTR(HT, 1, INSTR(HT,' ')-1) AS INTEGER)+ CAST(SUBSTR(HT, INSTR(HT,' ')+1) AS FLOAT)/12 >= 6.67;"}
+{"question": "Which three teams have the most players from a particular college", "sql": "SELECT team, COLLEGE, COUNT(*) as num_players FROM nba_roster WHERE COLLEGE!= '--' GROUP BY team, COLLEGE ORDER BY num_players DESC LIMIT 3;"}
+{"question": "What is the total salary for each team in the NBA, excluding teams with missing salary data", "sql": "SELECT team, SUM(CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER)) AS total_salary FROM nba_roster WHERE SALARY!= '--' GROUP BY team ORDER BY total_salary DESC;"}
+{"question": "Which team has the most players under the age of 25", "sql": "SELECT team, COUNT(*) as num_players FROM nba_roster WHERE AGE <= 25 GROUP BY team ORDER BY num_players DESC;"}
+{"question": "What is the average age of players in the NBA who are older than 5 years", "sql": "SELECT AVG(AGE) AS average_age FROM nba_roster WHERE AGE * 12 > 60;"}
+{"question": "What team pays its players the most, on average", "sql": "SELECT team, AVG(CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER)) AS average_salary FROM nba_roster WHERE SALARY!= '--' GROUP BY team ORDER BY average_salary DESC LIMIT 1;"}
+{"question": "Who is the highest paid center on the Dallas Mavericks who is older than 5 years old", "sql": "SELECT name, salary FROM nba_roster WHERE team='Dallas Mavericks' AND POS='C' AND SALARY!= '--' AND age > 5 ORDER BY CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER) DESC LIMIT 1;"}
+{"question": "Who is the highest-paid Power Forward on the Chicago Bulls", "sql": "SELECT name, salary FROM nba_roster WHERE team='Chicago Bulls' AND POS='PF' AND SALARY!='--' ORDER BY CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER) DESC LIMIT 1;"}
+{"question": "How many players are currently on the Toronto Raptors' roster", "sql": "SELECT COUNT(*) FROM nba_roster WHERE Team = 'Toronto Raptors';"}
+{"question": "How many players in the NBA are over the age of 30", "sql": "SELECT COUNT(*) AS num_players FROM nba_roster WHERE AGE > 30;"}
+{"question": "What is the most common position among players 25 or older in the NBA", "sql": "SELECT POS, COUNT(*) AS count FROM nba_roster WHERE AGE >= 25 GROUP BY POS ORDER BY count DESC LIMIT 1;"}
+{"question": "What is the jersey number of the player with the 75th percentile of jersey numbers in the NBA", "sql": "SELECT CAST(Jersey AS INTEGER) as percentile FROM nba_roster ORDER BY percentile LIMIT 1 OFFSET (SELECT COUNT(*) FROM nba_roster) * 0.75;"}
+{"question": "What is the most common position among the Toronto Raptors players", "sql": "SELECT POS, COUNT(*) AS count FROM nba_roster WHERE team='Toronto Raptors' GROUP BY POS ORDER BY count DESC LIMIT 1;"}
+{"question": "Which team has the heaviest average weight", "sql": "SELECT team, AVG(CAST(SUBSTR(WT, 1, INSTR(WT,' ')-1) AS INTEGER) + CAST(SUBSTR(WT, INSTR(WT,' ')+1) AS FLOAT)/16) as average_weight FROM nba_roster WHERE WT!= 'NA' GROUP BY team ORDER BY average_weight DESC LIMIT 1;"}
+{"question": "Who are the top 3 highest-paid Power Forwards in the NBA", "sql": "SELECT NAME, SALARY FROM nba_roster WHERE POS = 'PF' ORDER BY CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER) DESC LIMIT 3;"}
+{"question": "Which teams have the smallest rosters and what is the average age of their players", "sql": "SELECT team, COUNT(*) AS roster_size, AVG(AGE) AS average_age FROM nba_roster GROUP BY team ORDER BY roster_size ASC;"}
+{"question": "Which team has the highest average salary for players who attended college", "sql": "SELECT team, AVG(CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER)) as salary FROM nba_roster WHERE COLLEGE!= '--' GROUP BY team ORDER BY salary DESC LIMIT 1;"}
+{"question": "Which team has the shortest average height among players 25 years old or younger", "sql": "SELECT team, AVG(CAST(SUBSTR(HT, 1, INSTR(HT,' ')-1) AS INTEGER)+ CAST(SUBSTR(HT, INSTR(HT,' ')+1) AS FLOAT)/12) as height FROM nba_roster WHERE AGE <= 25 GROUP BY team ORDER BY height ASC LIMIT 1;"}
+{"question": "Which three teams have the tallest players on average", "sql": "SELECT team, AVG(CAST(SUBSTR(HT, 1, INSTR(HT,' ')-1) AS INTEGER)+ CAST(SUBSTR(HT, INSTR(HT,' ')+1) AS FLOAT)/12) as average_height FROM nba_roster WHERE HT!= 'NA' GROUP BY team ORDER BY average_height DESC LIMIT 3;"}
+{"question": "Who are the top 3 players in the league by salary, excluding those who did not attend college", "sql": "SELECT name, SALARY FROM nba_roster WHERE COLLEGE!= '--' ORDER BY CAST(SUBSTRING(SALARY, 2) AS INTEGER) DESC LIMIT 3;"}
+{"question": "Which five teams have the oldest average age among their players", "sql": "SELECT TEAM, AVG(AGE) as avg_age FROM nba_roster WHERE POS!= '--' GROUP BY TEAM ORDER BY avg_age DESC LIMIT 5;"}
+{"question": "Which three teams in the NBA have the highest average salary among their players", "sql": "SELECT team, AVG(CAST(SUBSTRING(SALARY, 2, LENGTH(SALARY)-2) AS INTEGER)) AS avg_salary FROM nba_roster WHERE SALARY!= '--' GROUP BY team ORDER BY avg_salary DESC LIMIT 3;"}
+{"question": "Who is the highest-paid player in the NBA who did not attend college", "sql": "SELECT name, SALARY FROM nba_roster WHERE SALARY!= '--' AND COLLEGE = '--' ORDER BY CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER) DESC LIMIT 1;"}
+{"question": "What is the average age of players in the Toronto Raptors", "sql": "SELECT AVG(AGE) FROM nba_roster WHERE TEAM = 'Toronto Raptors';"}
+{"question": "What is the player with the highest jersey number that is not 'NA'", "sql": "SELECT MAX(Jersey) as jersey_num, name FROM nba_roster WHERE Jersey!= 'NA' GROUP BY name ORDER BY jersey_num DESC LIMIT 1;"}
+{"question": "Who is the youngest player in the NBA", "sql": "SELECT name FROM nba_roster ORDER BY AGE ASC LIMIT 1;"}
+{"question": "What is the number of players in the NBA who are older than 5 years old", "sql": "SELECT COUNT(*) AS num_players FROM nba_roster WHERE AGE > 5;"}
+{"question": "Who is the highest-paid player on the Los Angeles Lakers", "sql": "SELECT name, salary FROM nba_roster WHERE team='Los Angeles Lakers' AND SALARY!= '--' ORDER BY CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER) DESC LIMIT 1;"}
+{"question": "Which colleges tend to produce the oldest players in the NBA", "sql": "SELECT COLLEGE, AVG(AGE) AS average_age FROM nba_roster WHERE COLLEGE!= '--' GROUP BY COLLEGE ORDER BY average_age DESC;"}
+{"question": "What percentage of players in the NBA play each position", "sql": "SELECT POS, COUNT(*) as count, ROUND(COUNT(*)*100.0/(SELECT COUNT(*) FROM nba_roster),2) as percentage FROM nba_roster WHERE POS!= '--' GROUP BY POS ORDER BY percentage DESC;"}
+{"question": "What are the top 10 teams with the most players in the NBA", "sql": "SELECT Team, COUNT(*) as num_players FROM nba_roster GROUP BY Team ORDER BY num_players DESC LIMIT 10;"}
+{"question": "What is the average age of players in the NBA who are older than the average age of all players in the league plus 5 years", "sql": "SELECT AVG(AGE) as average_age FROM nba_roster WHERE AGE > (SELECT AVG(AGE) FROM nba_roster) + 5;"}
+{"question": "Which team has the most players who are older than the average age of all players in the NBA plus 5 years", "sql": "SELECT team, COUNT(*) as num_players FROM nba_roster WHERE AGE > (SELECT AVG(AGE) FROM nba_roster) + 5 GROUP BY team ORDER BY num_players DESC LIMIT 1;"}
+{"question": "What are the average height and weight for each team in the NBA", "sql": "SELECT team, AVG(CAST(SUBSTR(HT, 1, INSTR(HT,' ')-1) AS INTEGER)+ CAST(SUBSTR(HT, INSTR(HT,' ')+1) AS FLOAT)/12) as avg_height, AVG(CAST(SUBSTR(WT, 1, INSTR(WT,' ')) as INTEGER)) as avg_weight FROM nba_roster GROUP BY team;"}
+{"question": "What are the top 3 teams with the highest average salary", "sql": "SELECT Team, AVG(CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER)) AS avg_salary FROM nba_roster GROUP BY Team ORDER BY avg_salary DESC LIMIT 3;"}
+{"question": "What position has the most players in the NBA roster", "sql": "SELECT POS, COUNT(*) AS count, POS FROM nba_roster GROUP BY POS ORDER BY count DESC LIMIT 1;"}
+{"question": "What is the age of the 75th percentile of NBA players", "sql": "SELECT age FROM nba_roster WHERE AGE!= '--' ORDER BY age LIMIT 1 OFFSET (SELECT COUNT(*) FROM nba_roster WHERE AGE!= '--')*75/100-1;"}
+{"question": "What is the average age and salary of NBA players, excluding those with unknown salaries", "sql": "SELECT AVG(AGE) AS average_age, AVG(CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER)) AS average_salary FROM nba_roster WHERE SALARY!= '--';"}
+{"question": "What is the average age of players in each position group in the NBA", "sql": "SELECT POS, AVG(AGE) AS avg_age FROM nba_roster WHERE POS IN ('PG', 'SG', 'SF', 'PF', 'C') GROUP BY POS;"}
+{"question": "What team has the most players at the point guard position", "sql": "SELECT team, COUNT(*) as num_players FROM nba_roster WHERE POS='PG' GROUP BY team ORDER BY num_players DESC LIMIT 1;"}
+{"question": "What is the name of the heaviest player on the Los Angeles Lakers", "sql": "SELECT name FROM nba_roster WHERE team='Los Angeles Lakers' AND WT=(SELECT MAX(WT) FROM nba_roster WHERE team='Los Angeles Lakers');"}
+{"question": "Who are the top 5 players in the NBA in terms of salary-to-age ratio", "sql": "SELECT name, CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER) as salary, AGE FROM nba_roster WHERE SALARY!= '--' ORDER BY CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER)/AGE DESC LIMIT 5;"}
+{"question": "Which NBA teams have the most players who attended college", "sql": "SELECT team, COUNT(*) as num_players FROM nba_roster WHERE COLLEGE!='--' GROUP BY team ORDER BY num_players DESC;"}
+{"question": "What is the highest paid player for each position in the NBA", "sql": "SELECT pos, name, MAX(CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER)) as max_salary FROM nba_roster WHERE SALARY!= '--' GROUP BY pos ORDER BY pos;"}
+{"question": "Which NBA teams have the most players", "sql": "SELECT Team, COUNT(*) as count FROM nba_roster GROUP BY Team ORDER BY count DESC;"}
+{"question": "What is the position with the most players under the age of 25 in the NBA", "sql": "SELECT POS, COUNT(*) as count FROM nba_roster WHERE AGE <= 25 GROUP BY POS ORDER BY count DESC LIMIT 1;"}
+{"question": "What players in the NBA have a height greater than or equal to 6 feet 7 inches", "sql": "SELECT NAME FROM nba_roster WHERE CAST(SUBSTR(HT, 1, INSTR(HT,' ')-1) AS INTEGER)+ CAST(SUBSTR(HT, INSTR(HT,' ')+1) AS FLOAT)/12 >= 6.67;"}
+{"question": "Who are the top 3 highest-paid players under the age of 25 in the NBA", "sql": "SELECT NAME, SALARY FROM nba_roster WHERE AGE < 25 ORDER BY CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER) DESC LIMIT 3;"}
+{"question": "What are the names of all the players on the Toronto Raptors who are 25 years or older", "sql": "SELECT name FROM nba_roster WHERE age >= 25 AND team = 'Toronto Raptors';"}
+{"question": "What is the position with the shortest average height in the NBA", "sql": "SELECT pos, AVG(CAST(SUBSTR(HT, 1, INSTR(HT,' ')-1) AS INTEGER)+ CAST(SUBSTR(HT, INSTR(HT,' ')+1) AS FLOAT)/12) as height, COUNT(*) as count FROM nba_roster GROUP BY pos ORDER BY height ASC LIMIT 1;"}
+{"question": "What is the average age of the players on the Memphis Grizzlies", "sql": "SELECT AVG(AGE) FROM nba_roster WHERE team='Memphis Grizzlies';"}
+{"question": "What are the average ages of the players on each NBA team, listed from youngest to oldest", "sql": "SELECT team, AVG(AGE) as average_age FROM nba_roster GROUP BY team ORDER BY average_age ASC;"}
+{"question": "What is the average age and height for each position in the NBA", "sql": "SELECT POS, AVG(AGE) AS avg_age, AVG(CAST(SUBSTR(HT, 1, INSTR(HT,' ')-1) AS INTEGER)+ CAST(SUBSTR(HT, INSTR(HT,' ')+1) AS FLOAT)/12) AS avg_height FROM nba_roster GROUP BY POS;"}
+{"question": "What is the highest-paid player in the NBA", "sql": "SELECT name, SALARY FROM nba_roster WHERE SALARY!= '--' ORDER BY CAST(SUBSTR(SALARY, 2) AS INTEGER) DESC LIMIT 1;"}
+{"question": "Which team has the most players 25 or older", "sql": "SELECT team, COUNT(*) as num_players FROM nba_roster WHERE AGE >= 25 GROUP BY team ORDER BY num_players DESC LIMIT 1;"}
+{"question": "What is the average age of NBA players who are at least 6 feet 7 inches tall", "sql": "SELECT AVG(AGE) FROM nba_roster WHERE CAST(SUBSTR(HT, 1, INSTR(HT,' ')-1) AS INTEGER)+ CAST(SUBSTR(HT, INSTR(HT,' ')+1) AS FLOAT)/12 >= 6.67;"}
+{"question": "What are all the players in the NBA whose last name is Johnson", "sql": "SELECT * FROM nba_roster WHERE NAME LIKE '%Johnson';"}
+{"question": "What is the average salary for players from each college, and which colleges produce the most highly paid NBA players", "sql": "SELECT COLLEGE, AVG(CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER)) as avg_salary FROM nba_roster WHERE COLLEGE!= '--' GROUP BY COLLEGE ORDER BY avg_salary DESC;"}
+{"question": "Who are the top 3 highest-paid players on the Los Angeles Lakers", "sql": "SELECT name, SALARY FROM nba_roster WHERE team='Los Angeles Lakers' ORDER BY CAST(SUBSTRING(SALARY, 2) AS INTEGER) DESC LIMIT 3;"}
+{"question": "What is the average height of all NBA players who are 25 years old or younger", "sql": "SELECT AVG(CAST(SUBSTR(HT, 1, INSTR(HT,' ')-1) AS INTEGER)) as average_height FROM nba_roster WHERE AGE <= 25;"}
+{"question": "What college has the most players in the NBA", "sql": "SELECT COLLEGE, COUNT(*) AS COUNT FROM nba_roster WHERE COLLEGE!= '--' GROUP BY COLLEGE ORDER BY COUNT(*) DESC LIMIT 1;"}
+{"question": "Who are the 25-year-old players on the Toronto Raptors", "sql": "SELECT name FROM nba_roster WHERE team='Toronto Raptors' AND age=25;"}
+{"question": "Who is the highest-paid player in the NBA who attended college", "sql": "SELECT name, SALARY FROM nba_roster WHERE COLLEGE!= '--' ORDER BY CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER) DESC LIMIT 1;"}
+{"question": "What college and position combination has the most players in the NBA", "sql": "SELECT COLLEGE, POS, COUNT(*) as count FROM nba_roster WHERE COLLEGE!= '--' GROUP BY COLLEGE, POS ORDER BY count DESC LIMIT 1;"}
+{"question": "Who is the heaviest player in the NBA roster", "sql": "SELECT name, WT, CAST(SUBSTR(WT, 1, INSTR(WT,' ')-1) AS INTEGER) as weight FROM nba_roster WHERE WT!= 'NA' ORDER BY weight DESC LIMIT 1;"}
+{"question": "What is the average height of players on each team, excluding those under 25 and with unknown heights", "sql": "SELECT team, AVG(CAST(SUBSTR(HT, 1, INSTR(HT,' ')-1) AS INTEGER) + CAST(SUBSTR(HT, INSTR(HT,' ')+1) AS FLOAT)/12) as avg_height FROM nba_roster WHERE HT!= 'NA' AND age > 25 GROUP BY team;"}
+{"question": "What is the average salary of NBA players over the age of 25", "sql": "SELECT AVG(CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER)) as average_salary FROM nba_roster WHERE AGE > 25 AND SALARY!= '--';"}
+{"question": "What are the 5 oldest players in the NBA", "sql": "SELECT NAME, AGE FROM nba_roster WHERE AGE != '--' ORDER BY AGE DESC LIMIT 5;"}
+{"question": "Which team has the most players over the age of 5 in the NBA", "sql": "SELECT team, COUNT(*) as num_players FROM nba_roster WHERE age > 5 GROUP BY team ORDER BY num_players DESC LIMIT 1;"}
+{"question": "Who is the highest-paid player in the NBA, excluding those under the age of 6 and those with unknown salaries", "sql": "SELECT name, team FROM nba_roster WHERE age > 5 AND SALARY!= '--' ORDER BY CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER) DESC LIMIT 1;"}
+{"question": "What is the average height and weight of players on each NBA team", "sql": "SELECT team, AVG(CAST(SUBSTR(HT, 1, INSTR(HT,' ')-1) AS INTEGER)+ CAST(SUBSTR(HT, INSTR(HT,' ')+1) AS FLOAT)/12) as avg_height, AVG(CAST(SUBSTR(WT, 1, INSTR(WT,' ')) AS INTEGER)) as avg_weight FROM nba_roster GROUP BY team;"}
+{"question": "Which positions in the NBA have the most players and which positions have the oldest players on average", "sql": "SELECT POS, COUNT(*) as count, AVG(AGE) as average_age FROM nba_roster GROUP BY POS ORDER BY count DESC;"}
+{"question": "What is the position with the tallest players in the NBA", "sql": "SELECT POS, AVG(CAST(SUBSTR(HT, 1, INSTR(HT,' ')-1) AS INTEGER)) as avg_height FROM nba_roster GROUP BY POS ORDER BY avg_height DESC LIMIT 1;"}
+{"question": "What are the top 3 tallest players in the NBA", "sql": "SELECT NAME, HT, AVG(CAST(SUBSTR(HT, 1, INSTR(HT,' ')-1) AS INTEGER)+ CAST(SUBSTR(HT, INSTR(HT,' ')+1) AS FLOAT)/12) as height FROM nba_roster GROUP BY NAME, HT ORDER BY height DESC LIMIT 3;"}
+{"question": "Who is the highest-paid player on the Toronto Raptors with a jersey number greater than 10", "sql": "SELECT name, salary FROM nba_roster WHERE team='Toronto Raptors' AND CAST(Jersey AS INTEGER) > 10 AND SALARY!= '--' ORDER BY CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER) DESC LIMIT 1;"}
+{"question": "Which teams have the most players in their roster", "sql": "SELECT team, COUNT(*) as num_players FROM nba_roster GROUP BY team ORDER BY num_players DESC;"}
+{"question": "What is the average salary of all NBA players, excluding those who are not paid or have an unknown position", "sql": "SELECT AVG(CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER)) as average_salary FROM nba_roster WHERE SALARY!= '--' AND POS!= 'NA';"}
+{"question": "Which team has invested the most in young talent, with an average salary for players 5 years or less younger than the average age of all players", "sql": "SELECT team, AVG(CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER)) as average_salary FROM nba_roster WHERE AGE <= (SELECT AVG(AGE) FROM nba_roster) * 5 GROUP BY team ORDER BY average_salary DESC LIMIT 1;"}
+{"question": "Which 5 teams have the most players who have publicly disclosed their college information", "sql": "SELECT team, COUNT(*) as num_players FROM nba_roster WHERE COLLEGE!= '--' GROUP BY team ORDER BY num_players DESC LIMIT 5;"}
+{"question": "What is the average age of players by position in the NBA", "sql": "SELECT POS, AVG(AGE) as average_age FROM nba_roster GROUP BY POS ORDER BY average_age ASC;"}
+{"question": "What is the average height of the tallest positions in the NBA", "sql": "SELECT POS, AVG(CAST(SUBSTR(HT, 1, INSTR(HT,' ')-1) AS INTEGER)) as avg_height FROM nba_roster GROUP BY POS ORDER BY avg_height DESC;"}
+{"question": "What is the number of players on the Chicago Bulls who are 25 years old or younger", "sql": "SELECT COUNT(*) FROM nba_roster WHERE team='Chicago Bulls' AND AGE <= 25;"}
+{"question": "What are the average heights for each position in the NBA, and which position has the tallest players on average", "sql": "SELECT pos, AVG(CAST(SUBSTR(HT, 1, INSTR(HT,' ')-1) AS INTEGER)) as avg_height, COUNT(*) as count FROM nba_roster WHERE HT!= 'NA' GROUP BY pos ORDER BY avg_height DESC;"}
+{"question": "Which team has the oldest average age among its guards and forwards", "sql": "SELECT team, AVG(AGE) AS average_age FROM nba_roster WHERE POS IN ('PG', 'SG', 'SF', 'PF', 'C') GROUP BY team ORDER BY average_age DESC LIMIT 1;"}
+{"question": "Who is the point guard for the Golden State Warriors?", "sql": "select name from nba_roster where team='Golden State Warriors' and POS='PG';"}
+{"question": "What is the number of players on the Chicago Bulls who are 25 years old or younger", "sql": "SELECT COUNT(*) FROM nba_roster WHERE team='Chicago Bulls' AND AGE <= 25;"}
+{"question": "Who is the highest-paid player on the Los Angeles Lakers", "sql": "SELECT NAME, SALARY FROM nba_roster WHERE team='Los Angeles Lakers' ORDER BY CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER) DESC LIMIT 1;"}
+{"question": "Who is the highest paid player in the NBA?", "sql": "SELECT NAME, salary FROM nba_roster WHERE SALARY!= '--' ORDER BY CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER) DESC LIMIT 1;"}
+{"question": "What team is LaMelo Ball on?", "sql": "select team from nba_roster where name='LaMelo Ball';"}
+{"question": "How much does Lonzo Ball weigh?", "sql": "select wt from nba_roster where name='Lonzo Ball';"}
+{"question": "What college sent the most players to the current NBA?", "sql": "select college from nba_roster where college != '--'  group by college order by count(*) desc limit 1;"}
+{"question": "How old is Lebron James?", "sql": "select age from nba_roster where name='LeBron James';"}
+{"question": "What is the most popular jersey number in the current NBA?", "sql": "select Jersey from nba_roster where Jersey != 'NA' group by Jersey order by count(*) desc limit 1;"}
+{"question": "Can you give me a list of all the players without college data?", "sql": "SELECT name FROM nba_roster WHERE COLLEGE IS NULL OR COLLEGE = '--';"}
+{"question": "What team has the smallest roster?", "sql": "select team from nba_roster group by team order by count(*) asc limit 1;"}
+{"question": "What team has the largest roster?", "sql": "select team, count(*) from nba_roster group by team order by count(*) desc limit 1;"}
+{"question": "What team is paying its players the most in total?", "sql": "select team, sum(CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER)) from nba_roster group by team order by sum(CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER)) desc limit 1;"}
+{"question": "Which team is paying its players the least?", "sql": "select team from nba_roster group by team order by sum(CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER)) asc limit 1;"}
+{"question": "Which team is on average the tallest?", "sql": "select team, AVG(CAST(SUBSTR(HT, 1, INSTR(HT,' ')-1) AS INTEGER)+ CAST(SUBSTR(HT, INSTR(HT,' ')+1) AS FLOAT)/12) as height from nba_roster group by team order by height desc limit 1;"}
+{"question": "Which team is on average the shortest?", "sql": "select team, AVG(CAST(SUBSTR(HT, 1, INSTR(HT,' ')-1) AS INTEGER)+ CAST(SUBSTR(HT, INSTR(HT,' ')+1) AS FLOAT)/12) as height from nba_roster group by team order by height asc limit 1;"}
+{"question": "Who are the tallest 5 centers in the league?", "sql": "SELECT name, HT FROM nba_roster WHERE POS = 'C' ORDER BY HT DESC LIMIT 5;"}
+{"question": "Who are the top 5 highest paid power forwards in the league?", "sql": "SELECT NAME, salary FROM nba_roster WHERE POS = 'PF' AND SALARY!= '--' ORDER BY CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER) DESC LIMIT 5;"}
+{"question": "What is the median salary in the NBA?", "sql": "SELECT (CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER)) as percentile FROM nba_roster WHERE SALARY!= '--' order by percentile limit 1 offset (select count(*) from nba_roster where SALARY != '--')*50/100-1;"}
+{"question": "What is the average salary in the NBA?", "sql": "SELECT avg(CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER)) as percentile FROM nba_roster WHERE SALARY!= '--';"}
+{"question": "What is the 99th percentile salary in the NBA?", "sql": "SELECT (CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER)) as percentile FROM nba_roster WHERE SALARY!= '--' order by percentile limit 1 offset (select count(*) from nba_roster where SALARY != '--')*99/100-1;"}
+{"question": "What is the 75th percentile salary in the NBA?", "sql": "SELECT (CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER)) as percentile FROM nba_roster WHERE SALARY!= '--' order by percentile limit 1 offset (select count(*) from nba_roster where SALARY != '--')*75/100-1;"}
+{"question": "What is the 25th percentile salary in the NBA?", "sql": "SELECT (CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER)) as percentile FROM nba_roster WHERE SALARY!= '--' order by percentile limit 1 offset (select count(*) from nba_roster where SALARY != '--')*25/100-1;"}
+{"question": "What is the median weight in the NBA?", "sql": "select CAST(SUBSTR(WT, 1, INSTR(WT,' ')) as INTEGER) as percentile from nba_roster order by percentile limit 1 offset (select count(*) from nba_roster)*50/100-1;"}
+{"question": "What is the average weight in the NBA?", "sql": "SELECT AVG(CAST(SUBSTR(WT, 1, INSTR(WT,' ')) as INTEGER)) FROM nba_roster;"}
+{"question": "What is the median height in the NBA?", "sql": "select CAST(SUBSTR(HT, 1, INSTR(HT,' ')-1) AS INTEGER)+ CAST(SUBSTR(HT, INSTR(HT,' ')+1) AS FLOAT)/12 as percentile from nba_roster order by percentile limit 1 offset (select count(*) from nba_roster)*50/100-1;"}
+{"question": "What is the average height in the NBA?", "sql": "select AVG(CAST(SUBSTR(HT, 1, INSTR(HT,' ')-1) AS INTEGER)+ CAST(SUBSTR(HT, INSTR(HT,' ')+1) AS FLOAT)/12) as height from nba_roster;"}
+{"question": "Can you tell me how many players are in the NBA?", "sql": "select count(*) from nba_roster;"}
+{"question": "Would you please let me know what the highest paid players are for each position?", "sql": "SELECT name, pos, MAX(CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER)) as max_salary FROM nba_roster WHERE SALARY!= '--' GROUP BY POS;"}
+{"question": "Is Jalen Johnson 23 years old?", "sql" : "Select name, age from nba_roster where name='Jalen Johnson';"}
+{"question": "Who is the oldest player on the Brooklyn Nets?", "sql" : "SELECT NAME FROM nba_roster WHERE TEAM = 'Brooklyn Nets' AND AGE = (SELECT MAX(AGE) FROM nba_roster WHERE TEAM = 'Brooklyn Nets');"}
+{"question": "Who has the highest salary on the Memphis Grizzlies?", "sql" : "select salary, name from nba_roster where team='Memphis Grizzlies' and SALARY!= '--' ORDER BY CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER) DESC LIMIT 1;"}
+{"question": "Which player has the highest salary on the Cleveland Cavaliers?", "sql" : "select salary, name from nba_roster where team='Cleveland Cavaliers' and SALARY!= '--' ORDER BY CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER) DESC LIMIT 1;"}
+{"question": "Who is the highest paid center on the Dallas Mavericks?", "sql" : "select salary, name from nba_roster where team='Dallas Mavericks' and POS='C' and SALARY!= '--' ORDER BY CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER) DESC LIMIT 1;"}
+{"question": "How much is Marcus Smart getting paid?", "sql" : "select salary from nba_roster where name='Marcus Smart';"}
+{"question": "What's the average age of the Trail Blazers?", "sql" : "select avg(age) from nba_roster where team='Portland Trail Blazers';"}
+{"question": "What's the median age of the NBA?", "sql": "select CAST(AGE as INTEGER) as percentile from nba_roster order by percentile limit 1 offset (select count(*) from nba_roster)*50/100-1;"}
+{"question": "What's the median age of the Miami Heat?", "sql": "select CAST(AGE as INTEGER) as percentile from nba_roster where team='Miami Heat' order by percentile limit 1 offset (select count(*) from nba_roster where team='Miami Heat')*50/100-1;"}
+{"question": "What are the 5 teams with the oldest average age in the NBA", "sql": "SELECT team, AVG(AGE) AS average_age FROM nba_roster GROUP BY team ORDER BY average_age DESC LIMIT 5;"}
+{"question": "What is the average salary of Power Forward players in the NBA", "sql": "select avg(CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER)) as average_salary from nba_roster where POS = 'PF';"}

+ 159 - 0
recipes/3p_integrations/lamini/text2sql_memory_tuning/data/training_data/generated_queries.jsonl

@@ -0,0 +1,159 @@
+{"question": "What is the average height of NBA players", "sql": "SELECT AVG(CAST(SUBSTRING(HT, 0, INSTR(HT,'')-1) AS INTEGER) + CAST(SUBSTRING(HT, INSTR(HT,'')+1) AS INTEGER)/12) as average_height FROM nba_roster WHERE HT!= 'NA';"}
+{"question": "What is the average age of all players in the NBA", "sql": "SELECT AVG(AGE) FROM nba_roster"}
+{"question": "What are the oldest players on each team with a roster size of 6 or more", "sql": "SELECT NAME FROM nba_roster WHERE AGE IN (SELECT MAX(AGE) FROM nba_roster WHERE TEAM IN (SELECT TEAM FROM nba_roster GROUP BY TEAM HAVING COUNT(*) > 5))"}
+{"question": "What is the average height of the players on the Toronto Raptors", "sql": "SELECT AVG(CAST(SUBSTR(HT, 1, INSTR(HT,' ')-1) AS INTEGER)+ CAST(SUBSTR(HT, INSTR(HT,' ')+1) AS FLOAT)/12) as height FROM nba_roster WHERE team='Toronto Raptors';"}
+{"question": "What is the highest-paid Toronto Raptors player who attended college", "sql": "SELECT name, salary FROM nba_roster WHERE team='Toronto Raptors' AND COLLEGE!='--' AND SALARY!='--' ORDER BY CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER) DESC LIMIT 1"}
+{"question": "What is the most common height among NBA players", "sql": "SELECT HT, COUNT(*) as count FROM nba_roster WHERE HT IS NOT NULL GROUP BY HT ORDER BY count DESC LIMIT 1"}
+{"question": "What is the most represented college in the NBA", "sql": "SELECT COLLEGE, COUNT(*) as count FROM nba_roster WHERE COLLEGE IS NOT NULL GROUP BY COLLEGE ORDER BY count DESC LIMIT 1"}
+{"question": "What is the average age of all players in the NBA", "sql": "SELECT AVG(AGE) AS average_age FROM nba_roster"}
+{"question": "What is the average height of NBA players", "sql": "SELECT AVG(CAST(SUBSTR(HT, 1, INSTR(HT,' ')-1) AS INTEGER) + CAST(SUBSTR(HT, INSTR(HT,' ')+1) AS FLOAT)/12) AS average_height FROM nba_roster"}
+{"question": "What is the average age of the players in the NBA", "sql": "SELECT AVG(AGE) FROM nba_roster WHERE AGE IS NOT NULL"}
+{"question": "What is the position with the most players in the NBA", "sql": "SELECT POS, COUNT(*) as count FROM nba_roster WHERE SALARY!= '--' GROUP BY POS ORDER BY count DESC LIMIT 1"}
+{"question": "What is the average height of players on each NBA team, excluding players with unknown heights", "sql": "SELECT TEAM, AVG(CAST(SUBSTRING(HT, 0, INSTR(HT,'')-1) AS INTEGER)) as avg_height FROM nba_roster WHERE HT!= 'NA' GROUP BY TEAM ORDER BY avg_height DESC"}
+{"question": "What are the 5 most common heights among NBA players", "sql": "SELECT HT, COUNT(*) AS count FROM nba_roster GROUP BY HT ORDER BY count DESC LIMIT 5"}
+{"question": "What are the top 5 colleges with the most players in the NBA", "sql": "SELECT COLLEGE, COUNT(*) AS count FROM nba_roster WHERE COLLEGE!= '--' GROUP BY COLLEGE ORDER BY count DESC LIMIT 5"}
+{"question": "What is the average age of the players in the NBA", "sql": "SELECT AVG(AGE) FROM nba_roster WHERE AGE IS NOT NULL"}
+{"question": "Which players in the NBA have attended the most colleges", "sql": "SELECT NAME, COLLEGE, COUNT(*) as num_colleges FROM nba_roster WHERE COLLEGE!= '--' GROUP BY NAME, COLLEGE ORDER BY num_colleges DESC;"}
+{"question": "What is the average age of the players in the NBA", "sql": "SELECT AVG(AGE) FROM nba_roster"}
+{"question": "Who are the top 5 highest-paid players in the NBA", "sql": "SELECT * FROM nba_roster WHERE SALARY!= '--' ORDER BY CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER) DESC LIMIT 5"}
+{"question": "What is the average height of players on each NBA team", "sql": "SELECT team, AVG(CAST(SUBSTRING(HT, 1, INSTR(HT,'')-1) AS INTEGER) + CAST(SUBSTRING(HT, INSTR(HT,'')+1) AS INTEGER) / 12.0) as avg_height FROM nba_roster WHERE HT!= 'NA' GROUP BY team"}
+{"question": "Who are the top 3 highest-paid players in the NBA", "sql": "SELECT name, SUM(CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER)) as total_salary FROM nba_roster WHERE SALARY!= '--' GROUP BY name ORDER BY total_salary DESC LIMIT 3"}
+{"question": "Which team has the most players in the NBA", "sql": "SELECT team, COUNT(*) as num_players FROM nba_roster GROUP BY team ORDER BY num_players DESC LIMIT 1"}
+{"question": "What is the total salary of all players in the NBA who are 6'8", "sql": "SELECT SUM(CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER)) as total_salary FROM nba_roster WHERE CAST(SUBSTR(HT, 1, INSTR(HT,'')-1) AS INTEGER) = 68;"}
+{"question": "What is the average age of players on each team in the NBA", "sql": "SELECT team, AVG(AGE) as avg_age FROM nba_roster WHERE SALARY!= '--' GROUP BY team"}
+{"question": "How many players in the NBA have a non-null salary and college information, and play one of the five main positions", "sql": "SELECT COUNT(*) as num_players FROM nba_roster WHERE POS IN ('PG', 'SG', 'SF', 'PF', 'C') AND SALARY!= '--' AND COLLEGE!= '--'"}
+{"question": "What is the most common position in the NBA", "sql": "SELECT POS, COUNT(*) as count FROM nba_roster GROUP BY POS ORDER BY count DESC LIMIT 1"}
+{"question": "What is the average height of NBA players", "sql": "SELECT AVG(CAST(SUBSTR(HT, 1, INSTR(HT,' ')-1) AS INTEGER) + CAST(SUBSTR(HT, INSTR(HT,' ')+1) AS FLOAT)/12) as average_height FROM nba_roster;"}
+{"question": "What is the average salary of NBA players who are at least 5 years old", "sql": "SELECT AVG(CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER)) as average_salary FROM nba_roster WHERE AGE > 5"}
+{"question": "What is the average age of all players in the NBA", "sql": "SELECT AVG(AGE) FROM nba_roster"}
+{"question": "What is the most common age range among NBA players", "sql": "SELECT AGE, COUNT(*) AS count FROM nba_roster GROUP BY AGE ORDER BY count DESC LIMIT 1"}
+{"question": "Which team has the most players in the NBA", "sql": "SELECT Team, COUNT(*) as num_players FROM nba_roster GROUP BY Team ORDER BY num_players DESC LIMIT 1"}
+{"question": "What is the average salary of NBA players", "sql": "SELECT AVG(CAST(SUBSTR(SALARY, 1, INSTR(SALARY, '$')-1) AS INTEGER)) FROM nba_roster WHERE SALARY!= '--';"}
+{"question": "How many players in the NBA are 68 inches tall", "sql": "SELECT COUNT(*) FROM nba_roster WHERE CAST(SUBSTR(HT, 1, INSTR(HT,'')-1) AS INTEGER) = 68;"}
+{"question": "What is the average salary of Power Forwards in the NBA who are at least 25 years old", "sql": "SELECT AVG(CAST(SUBSTR(SALARY, 1, INSTR(SALARY, '$')-1) AS INTEGER)) AS average_salary FROM nba_roster WHERE AGE >= 25 AND POS = 'PF';"}
+{"question": "What is the average age of 6-foot Power Forwards in the NBA", "sql": "SELECT AVG(AGE) FROM nba_roster WHERE CAST(SUBSTR(HT, 1, INSTR(HT,' ')-1) AS INTEGER) = 6 AND POS='PF';"}
+{"question": "What is the heaviest Power Forward in the NBA", "sql": "SELECT NAME, AVG(CAST(SUBSTR(WT, 1, INSTR(WT,' ')) AS INTEGER)) AS avg_weight FROM nba_roster WHERE POS='PF' GROUP BY NAME ORDER BY avg_weight DESC LIMIT 1"}
+{"question": "What is the number of players on each team in the NBA", "sql": "SELECT Team, COUNT(*) as num_players FROM nba_roster GROUP BY Team"}
+{"question": "What is the average height of NBA players who are 25 years old or older", "sql": "SELECT AVG(CAST(SUBSTR(HT, 1, INSTR(HT,' ')-1) AS INTEGER)+ CAST(SUBSTR(HT, INSTR(HT,' ')+1) AS FLOAT)/12) as height FROM nba_roster WHERE age >= 25"}
+{"question": "What are the top 3 teams with the highest average salaries in the NBA", "sql": "SELECT team, AVG(CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER)) as avg_salary FROM nba_roster WHERE SALARY!= '--' GROUP BY team ORDER BY avg_salary DESC LIMIT 3"}
+{"question": "What is the most common position in the NBA", "sql": "SELECT POS, COUNT(*) as count FROM nba_roster GROUP BY POS ORDER BY count DESC LIMIT 1"}
+{"question": "What are the names of the players in the NBA who are exactly 6 feet 8 inches tall", "sql": "SELECT NAME, HT FROM nba_roster WHERE CAST(SUBSTRING(HT, 0, INSTR(HT,'')-1) AS INTEGER) = 68 ORDER BY HT ASC;"}
+{"question": "What is the college with the most players in the NBA", "sql": "SELECT COLLEGE, COUNT(*) as count FROM nba_roster WHERE COLLEGE!= '--' GROUP BY COLLEGE ORDER BY count DESC LIMIT 1"}
+{"question": "What is the average age of all players in the NBA", "sql": "SELECT AVG(AGE) FROM nba_roster"}
+{"question": "What is the most represented college in the NBA", "sql": "SELECT COLLEGE, COUNT(*) AS frequency FROM nba_roster WHERE COLLEGE!= '--' GROUP BY COLLEGE ORDER BY frequency DESC LIMIT 1"}
+{"question": "What is the average age of the players in the NBA", "sql": "SELECT AVG(AGE) as average_age FROM nba_roster WHERE AGE IS NOT NULL"}
+{"question": "What is the average height of NBA players who have a recorded height", "sql": "SELECT AVG(CAST(SUBSTR(HT, 1, INSTR(HT,' ')-1) AS INTEGER) + CAST(SUBSTR(HT, INSTR(HT,' ')+1) AS FLOAT)/12) as average_height FROM nba_roster WHERE HT IS NOT NULL"}
+{"question": "What is the average salary of NBA players who are 25 years or older", "sql": "SELECT AVG(CAST(SUBSTR(SALARY, 1, INSTR(SALARY, '$') - 1) as INTEGER)) FROM nba_roster WHERE CAST(AGE as INTEGER) >= 25"}
+{"question": "What is the most represented college in the NBA", "sql": "SELECT COLLEGE, COUNT(*) as count FROM nba_roster WHERE COLLEGE!= '--' GROUP BY COLLEGE ORDER BY count DESC LIMIT 1"}
+{"question": "What is the number of players on each team in the NBA", "sql": "SELECT Team, COUNT(*) as num_players FROM nba_roster GROUP BY Team"}
+{"question": "What is the average salary for each position in the NBA, excluding players with unknown salaries", "sql": "SELECT POS, AVG(CAST(SUBSTR(SALARY, 1, INSTR(SALARY, '$') - 1) as INTEGER)) as avg_salary FROM nba_roster WHERE SALARY!= '--' GROUP BY POS"}
+{"question": "What is the most common position in the NBA", "sql": "SELECT POS, COUNT(*) as count FROM nba_roster GROUP BY POS ORDER BY count DESC LIMIT 1"}
+{"question": "What is the average age of players on each team in the NBA", "sql": "SELECT team, AVG(AGE) as avg_age FROM nba_roster WHERE SALARY!= '--' GROUP BY team"}
+{"question": "What are the top 3 positions with the highest total salary expenditure in the NBA", "sql": "SELECT pos, name, SUM(CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER)) as total_salary FROM nba_roster WHERE SALARY!= '--' GROUP BY pos ORDER BY total_salary DESC LIMIT 3"}
+{"question": "Which colleges have the most players in the NBA", "sql": "SELECT COLLEGE, COUNT(*) AS num_players FROM nba_roster WHERE COLLEGE!= '--' GROUP BY COLLEGE ORDER BY num_players DESC;"}
+{"question": "What is the average salary for each team in the NBA", "sql": "SELECT team, AVG(CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER)) as avg_salary FROM nba_roster WHERE SALARY!= '--' GROUP BY team"}
+{"question": "What is the age range of players on each team in the NBA", "sql": "SELECT team, MIN(AGE) as youngest_player, MAX(AGE) as oldest_player FROM nba_roster WHERE AGE IS NOT NULL GROUP BY team"}
+{"question": "Which team has the most players who are 6'8", "sql": "SELECT team, COUNT(*) as num_players FROM nba_roster WHERE CAST(SUBSTR(HT, 1, INSTR(HT,'')-1) AS INTEGER) = 68 GROUP BY team ORDER BY num_players DESC LIMIT 1"}
+{"question": "How many players in the NBA are over the age of 25", "sql": "SELECT COUNT(*) FROM nba_roster WHERE AGE > 25"}
+{"question": "What is the average height of NBA players under the age of 25", "sql": "SELECT AVG(CAST(SUBSTR(HT, 1, INSTR(HT,' ')-1) AS INTEGER)+ CAST(SUBSTR(HT, INSTR(HT,' ')+1) AS FLOAT)/12) as average_height FROM nba_roster WHERE AGE <= 25"}
+{"question": "What is the total salary of all players in the NBA who are more than 5 years older than the average age of all players", "sql": "SELECT SUM(CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER)) as total_salary FROM nba_roster WHERE (AGE - (SELECT AVG(AGE) FROM nba_roster)) > 5"}
+{"question": "What is the median weight in the NBA", "sql": "SELECT COLLEGE, COUNT(*) as count FROM nba_roster WHERE COLLEGE!= '--' GROUP BY COLLEGE ORDER BY count DESC LIMIT 1"}
+{"question": "What are the top 5 teams with the oldest average age of players", "sql": "SELECT team, AVG(AGE) AS average_age FROM nba_roster GROUP BY team ORDER BY average_age DESC LIMIT 5"}
+{"question": "What is the average height of NBA players", "sql": "SELECT AVG(CAST(SUBSTRING(HT, 0, INSTR(HT,'')-1) AS INTEGER)) AS average_height FROM nba_roster WHERE HT!= 'NA';"}
+{"question": "What is the average salary of the Los Angeles Lakers players", "sql": "SELECT AVG(CAST(SALARY AS INTEGER) ) AS average_salary FROM nba_roster WHERE team='Los Angeles Lakers';"}
+{"question": "What is the college that has produced the most players currently playing for the Boston Celtics", "sql": "SELECT COLLEGE, COUNT(*) AS count FROM nba_roster WHERE team='Boston Celtics' GROUP BY COLLEGE ORDER BY count DESC LIMIT 1"}
+{"question": "What college has the most players in the NBA who are 30 years old or older", "sql": "SELECT COLLEGE, COUNT(*) AS count FROM nba_roster WHERE AGE >= 30 GROUP BY COLLEGE ORDER BY count DESC LIMIT 1"}
+{"question": "How many players in the NBA are at least 5 years older than the youngest player in the league", "sql": "SELECT COUNT(*) as num_players FROM nba_roster WHERE AGE - (SELECT MIN(AGE) FROM nba_roster) > 5"}
+{"question": "What are the 5 colleges that have produced the most players in the NBA", "sql": "SELECT COLLEGE, COUNT(*) as num_players FROM nba_roster WHERE COLLEGE!= '--' GROUP BY COLLEGE ORDER BY num_players DESC LIMIT 5"}
+{"question": "What are the most common positions in the NBA", "sql": "SELECT POS, COUNT(*) as count FROM nba_roster WHERE POS!= '--' GROUP BY POS ORDER BY count DESC"}
+{"question": "What is the average age of all players in the NBA", "sql": "SELECT AVG(AGE) as average_age FROM nba_roster WHERE AGE IS NOT NULL"}
+{"question": "What are the teams with the highest average salaries in the NBA", "sql": "SELECT team, AVG(CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER)) as avg_salary FROM nba_roster WHERE SALARY!= '--' GROUP BY team ORDER BY avg_salary DESC"}
+{"question": "What is the average height of NBA players", "sql": "SELECT AVG(CAST(SUBSTR(HT, 1, INSTR(HT,' ')-1) AS INTEGER) + CAST(SUBSTR(HT, INSTR(HT,' ')+1) AS FLOAT)/12) as average_height FROM nba_roster"}
+{"question": "What is the average salary of all NBA players", "sql": "SELECT AVG(CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER)) as average_salary FROM nba_roster"}
+{"question": "What is the average age of the players on the Toronto Raptors", "sql": "SELECT AVG(AGE) FROM nba_roster WHERE team='Toronto Raptors';"}
+{"question": "Which three teams have the most players from a single college", "sql": "SELECT team, COLLEGE, COUNT(*) AS num_players FROM nba_roster GROUP BY team, COLLEGE ORDER BY num_players DESC LIMIT 3"}
+{"question": "What is the average salary of NBA players 25 years or older", "sql": "SELECT AVG(CAST(SUBSTR(SALARY, 1, INSTR(SALARY, '$')-1) AS INTEGER)) FROM nba_roster WHERE AGE >= 25"}
+{"question": "What is the total salary of all NBA players", "sql": "SELECT SUM(CAST(SUBSTR(SALARY, 1, INSTR(SALARY, '$')-1) AS INTEGER)*1000000) FROM nba_roster"}
+{"question": "What are the most common positions in the NBA", "sql": "SELECT POS, COUNT(*) AS num_players FROM nba_roster GROUP BY POS;"}
+{"question": "What is the average salary for each age group in the NBA", "sql": "SELECT AVG(CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER)) as average_salary, AGE as age_group FROM nba_roster WHERE SALARY!= '--' GROUP BY AGE ORDER BY age_group"}
+{"question": "What are the top 5 colleges that have produced the most NBA players", "sql": "SELECT COLLEGE, COUNT(*) as count FROM nba_roster WHERE COLLEGE!= '--' GROUP BY COLLEGE ORDER BY count DESC LIMIT 5"}
+{"question": "What is the most common position for players under the age of 25 in the NBA", "sql": "SELECT POS, COUNT(*) as count FROM nba_roster WHERE AGE <= 25 GROUP BY POS ORDER BY count DESC LIMIT 1"}
+{"question": "How many players in the NBA are 5 years or younger than the oldest player in the league", "sql": "SELECT COUNT(*) FROM nba_roster WHERE AGE + 5 <= (SELECT MAX(AGE) FROM nba_roster);"}
+{"question": "What are the top 5 colleges that have produced the most NBA players", "sql": "SELECT COLLEGE, COUNT(*) as count FROM nba_roster WHERE COLLEGE!= '--' GROUP BY COLLEGE ORDER BY count DESC LIMIT 5"}
+{"question": "What are the most common positions in the NBA", "sql": "SELECT POS, COUNT(*) as count FROM nba_roster GROUP BY POS ORDER BY count DESC"}
+{"question": "What is the average age of all players in the NBA", "sql": "SELECT AVG(AGE) FROM nba_roster"}
+{"question": "What are the most common heights in the NBA", "sql": "SELECT HT, COUNT(*) AS frequency FROM nba_roster GROUP BY HT ORDER BY frequency DESC LIMIT 5"}
+{"question": "What are the most common positions in the NBA", "sql": "SELECT POS, COUNT(*) as count FROM nba_roster GROUP BY POS ORDER BY count DESC"}
+{"question": "What is the average salary for each team in the NBA, excluding teams with unknown salaries", "sql": "SELECT TEAM, AVG(CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER)) as average_salary FROM nba_roster WHERE SALARY!= '--' GROUP BY TEAM ORDER BY average_salary DESC"}
+{"question": "What is the college that has produced the most NBA players", "sql": "SELECT COLLEGE, COUNT(*) as count FROM nba_roster WHERE COLLEGE!= '--' GROUP BY COLLEGE ORDER BY count DESC LIMIT 1"}
+{"question": "Who is the highest paid player in the NBA", "sql": "SELECT name, salary FROM nba_roster WHERE salary!= '--' ORDER BY CAST(REPLACE(REPLACE(salary, '$', ''), ',', '') AS INTEGER) DESC LIMIT 1"}
+{"question": "What is the average age of players who are 6'8", "sql": "SELECT AVG(AGE) FROM nba_roster WHERE CAST(SUBSTR(HT, 1, INSTR(HT,' ')-1) AS INTEGER) = 68"}
+{"question": "What is the average age of the players in the NBA who are more than 5 years older than the average age of all players", "sql": "SELECT AVG(AGE) FROM nba_roster WHERE AGE + (SELECT AVG(AGE) FROM nba_roster) > 5*12"}
+{"question": "What is the average age of the players in the NBA who are older than 5 years old", "sql": "SELECT AVG(AGE) FROM nba_roster WHERE AGE > 5*12"}
+{"question": "What are the top colleges that produce the most NBA players", "sql": "SELECT COLLEGE, COUNT(*) as num_players FROM nba_roster WHERE COLLEGE!= '--' GROUP BY COLLEGE ORDER BY num_players DESC;"}
+{"question": "How many players in the NBA are 6'8", "sql": "SELECT COUNT(*) FROM nba_roster WHERE CAST(SUBSTR(HT, 1, INSTR(HT,'')-1) AS INTEGER) = 68;"}
+{"question": "What is the average salary for each team in the NBA", "sql": "SELECT Team, AVG(CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER)) as average_salary FROM nba_roster GROUP BY Team"}
+{"question": "What are the top colleges represented in the NBA", "sql": "SELECT COLLEGE, COUNT(*) as num_players FROM nba_roster WHERE COLLEGE!= '--' GROUP BY COLLEGE ORDER BY num_players DESC;"}
+{"question": "What is the most represented college in the NBA", "sql": "SELECT COLLEGE, COUNT(*) as count FROM nba_roster WHERE COLLEGE!= '--' GROUP BY COLLEGE ORDER BY count DESC LIMIT 1"}
+{"question": "What are the 5 teams with the highest average salary in the NBA", "sql": "SELECT team, AVG(CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER)) AS average_salary FROM nba_roster WHERE SALARY!= '--' GROUP BY team ORDER BY average_salary DESC"}
+{"question": "What is the average age of players in the NBA", "sql": "SELECT AVG(AGE) FROM nba_roster"}
+{"question": "What is the most common height in the NBA", "sql": "SELECT SUBSTR(HT, 1, INSTR(HT,'')-1) as height, COUNT(*) as count FROM nba_roster GROUP BY SUBSTR(HT, 1, INSTR(HT,'')-1) ORDER BY count DESC LIMIT 1"}
+{"question": "What is the position with the most players in the NBA", "sql": "SELECT POS, COUNT(*) as count FROM nba_roster GROUP BY POS ORDER BY count DESC LIMIT 1"}
+{"question": "What is the 75th percentile salary in the NBA", "sql": "SELECT HT, AVG(WT) as avg_weight FROM nba_roster WHERE HT IS NOT NULL AND WT IS NOT NULL GROUP BY HT ORDER BY avg_weight DESC LIMIT 1"}
+{"question": "Which college has produced the most NBA players", "sql": "SELECT COLLEGE, COUNT(*) as count FROM nba_roster WHERE COLLEGE!= '--' GROUP BY COLLEGE ORDER BY count DESC LIMIT 1"}
+{"question": "What is the average salary of NBA players who are older than 25 years old", "sql": "SELECT AVG(CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER)) as average_salary FROM nba_roster WHERE AGE > 25"}
+{"question": "What is the average age of the players on the Toronto Raptors", "sql": "SELECT AVG(AGE) FROM nba_roster WHERE TEAM = 'Toronto Raptors';"}
+{"question": "What is the average height of the players on the Los Angeles Lakers", "sql": "SELECT AVG(CAST(SUBSTR(HT, 1, INSTR(HT,'')-1) AS INTEGER) + CAST(SUBSTR(HT, INSTR(HT,'')+1) AS FLOAT)/12) AS height FROM nba_roster WHERE TEAM = 'Los Angeles Lakers';"}
+{"question": "What is the position with the most players in the NBA", "sql": "SELECT POS, COUNT(*) as count FROM nba_roster GROUP BY POS ORDER BY count DESC LIMIT 1"}
+{"question": "What is the average age of all players in the NBA who are older than 5 years old", "sql": "SELECT AVG(AGE) as average_age FROM nba_roster WHERE AGE > 5"}
+{"question": "How many players on each team have a height of 6'8", "sql": "SELECT team, COUNT(*) as num_players FROM nba_roster WHERE CAST(SUBSTRING(HT, 1, INSTR(HT,'')-1) AS INTEGER) = 68 GROUP BY team"}
+{"question": "What is the 99th percentile salary in the NBA?", "answer": "46741590", "sql": "SELECT (CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER)) as percentile FROM nba_roster WHERE SALARY!= '--' order by percentile limit 1 offset (select count(*) from nba_roster where SALARY != '--')*99/100-1;"}
+{"question": "What is the 75th percentile salary in the NBA?", "answer": "13932008", "sql": "SELECT (CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER)) as percentile FROM nba_roster WHERE SALARY!= '--' order by percentile limit 1 offset (select count(*) from nba_roster where SALARY != '--')*75/100-1;"}
+{"question": "What is the 25th percentile salary in the NBA?", "answer": "2413304", "sql": "SELECT (CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER)) as percentile FROM nba_roster WHERE SALARY!= '--' order by percentile limit 1 offset (select count(*) from nba_roster where SALARY != '--')*25/100-1;"}
+{"question": "What is the median weight in the NBA?", "answer": "215", "sql": "select CAST(SUBSTR(WT, 1, INSTR(WT,' ')) as INTEGER) as percentile from nba_roster order by percentile limit 1 offset (select count(*) from nba_roster)/2;"}
+{"question": "What is the average weight in the NBA?", "answer": "214.98", "sql": "SELECT AVG(CAST(SUBSTR(WT, 1, INSTR(WT,' ')) as INTEGER)) FROM nba_roster;"}
+{"question": "What is the median height in the NBA?", "answer": "6.58333333333333", "sql": "select CAST(SUBSTR(HT, 1, INSTR(HT,' ')-1) AS INTEGER)+ CAST(SUBSTR(HT, INSTR(HT,' ')+1) AS FLOAT)/12 as percentile from nba_roster order by percentile limit 1 offset (select count(*) from nba_roster)/2;"}
+{"question": "What is the average height in the NBA?", "answer": "6.54986111111111", "sql": "select AVG(CAST(SUBSTR(HT, 1, INSTR(HT,' ')-1) AS INTEGER)+ CAST(SUBSTR(HT, INSTR(HT,' ')+1) AS FLOAT)/12) as height from nba_roster;"}
+{"question": "Can you tell me how many players are in the NBA?", "answer": "600", "sql": "select count(*) from nba_roster;"}
+{"question": "Would you please let me know what the highest paid players are for each position?", "answer": "The highest paid players are Nikola Jokic (C), Paul George (F), Norman Powell (G), Kevin Durant (PF), Stephen Curry (PG), LeBron James (SF), Bradley Beal (SG).", "sql": "SELECT name, pos, MAX(CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER)) as max_salary FROM nba_roster WHERE SALARY!= '--' GROUP BY POS;"}
+{"question": "Is Jalen Johnson 23 years old?", "answer": "No, Jalen Johnson is 21 years old", "sql": "Select name, age from nba_roster where name='Jalen Johnson';"}
+{"question": "Who is the oldest player on the Brooklyn Nets?", "answer": "Spencer Dinwiddie, Dorian Finney-Smith, Royce O'Neale", "sql": "SELECT NAME FROM nba_roster WHERE TEAM = 'Brooklyn Nets' AND AGE = (SELECT MAX(AGE) FROM nba_roster WHERE TEAM = 'Brooklyn Nets');"}
+{"question": "Who has the highest salary on the Memphis Grizzlies?", "answer": "Ja Morant", "sql": "select salary, name from nba_roster where team='Memphis Grizzlies' and SALARY!= '--' ORDER BY CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER) DESC LIMIT 1;"}
+{"question": "Which player has the highest salary on the Cleveland Cavaliers?", "answer": "Darius Garland", "sql": "select salary, name from nba_roster where team='Cleveland Cavaliers' and SALARY!= '--' ORDER BY CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER) DESC LIMIT 1;"}
+{"question": "Who is the highest paid center on the Dallas Mavericks?", "answer": "Dereck Lively II", "sql": "select salary, name from nba_roster where team='Dallas Mavericks' and POS='C' and SALARY!= '--' ORDER BY CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER) DESC LIMIT 1;"}
+{"question": "How much is Marcus Smart getting paid?", "answer": "$18,833,712", "sql": "select salary from nba_roster where name='Marcus Smart';"}
+{"question": "What's the average age of the Trail Blazers?", "answer": "24", "sql": "select avg(age) from nba_roster where team='Portland Trail Blazers';"}
+{"question": "What's the median age of the NBA?", "answer": "25", "sql": "select CAST(AGE as INTEGER) as percentile from nba_roster order by percentile limit 1 offset (select count(*) from nba_roster)/2;"}
+{"question": "What's the median age of the Miami Heat?", "answer": "26", "sql": "select CAST(AGE as INTEGER) as percentile from nba_roster where team='Miami Heat' order by percentile limit 1 offset (select count(*) from nba_roster where team='Miami Heat')/2;"}
+{"question": "What are the 5 teams with the oldest average age in the NBA", "answer": "Golden State Warriors, Milwaukee Bucks, Miami Heat, LA Clippers, Phoenix Suns", "sql": "SELECT team, AVG(AGE) AS average_age FROM nba_roster GROUP BY team ORDER BY average_age DESC LIMIT 5;"}
+{"question": "What is the average salary of Power Forward players in the NBA", "answer": "$10948045", "sql": "select avg(CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER)) as average_salary from nba_roster where POS = 'PF';"}
+{"question": "What is the most common position in the NBA", "sql": "SELECT POS, COUNT(*) as count FROM nba_roster GROUP BY POS ORDER BY count DESC LIMIT 1"}
+{"question": "What is the average height of NBA players", "sql": "SELECT AVG(CAST(SUBSTR(HT, 1, INSTR(HT,' ')-1) AS INTEGER) + CAST(SUBSTR(HT, INSTR(HT,' ')+1) AS FLOAT)/12) as average_height FROM nba_roster;"}
+{"question": "What is the average salary of NBA players who are at least 5 years old", "sql": "SELECT AVG(CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER)) as average_salary FROM nba_roster WHERE AGE > 5"}
+{"question": "What is the average age of all players in the NBA", "sql": "SELECT AVG(AGE) FROM nba_roster"}
+{"question": "What is the most common age range among NBA players", "sql": "SELECT AGE, COUNT(*) AS count FROM nba_roster GROUP BY AGE ORDER BY count DESC LIMIT 1"}
+{"question": "What is the median weight in the NBA", "sql": "SELECT COLLEGE, COUNT(*) as count FROM nba_roster WHERE COLLEGE!= '--' GROUP BY COLLEGE ORDER BY count DESC LIMIT 1"}
+{"question": "How many players in the NBA are at least 5 years older than the youngest player in the league", "sql": "SELECT COUNT(*) as num_players FROM nba_roster WHERE AGE - (SELECT MIN(AGE) FROM nba_roster) > 5"}
+{"question": "What are the 5 colleges that have produced the most players in the NBA", "sql": "SELECT COLLEGE, COUNT(*) as num_players FROM nba_roster WHERE COLLEGE!= '--' GROUP BY COLLEGE ORDER BY num_players DESC LIMIT 5"}
+{"question": "What are the most common positions in the NBA", "sql": "SELECT POS, COUNT(*) as count FROM nba_roster WHERE POS!= '--' GROUP BY POS ORDER BY count DESC"}
+{"question": "What is the average age of all players in the NBA", "sql": "SELECT AVG(AGE) as average_age FROM nba_roster WHERE AGE IS NOT NULL"}
+{"question": "What is the 99th percentile salary in the NBA?", "answer": "46741590", "sql": "SELECT (CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER)) as percentile FROM nba_roster WHERE SALARY!= '--' order by percentile limit 1 offset (select count(*) from nba_roster where SALARY != '--')*99/100-1;"}
+{"question": "What is the 75th percentile salary in the NBA?", "answer": "13932008", "sql": "SELECT (CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER)) as percentile FROM nba_roster WHERE SALARY!= '--' order by percentile limit 1 offset (select count(*) from nba_roster where SALARY != '--')*75/100-1;"}
+{"question": "What is the 25th percentile salary in the NBA?", "answer": "2413304", "sql": "SELECT (CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER)) as percentile FROM nba_roster WHERE SALARY!= '--' order by percentile limit 1 offset (select count(*) from nba_roster where SALARY != '--')*25/100-1;"}
+{"question": "What is the median weight in the NBA?", "answer": "215", "sql": "select CAST(SUBSTR(WT, 1, INSTR(WT,' ')) as INTEGER) as percentile from nba_roster order by percentile limit 1 offset (select count(*) from nba_roster)/2;"}
+{"question": "What is the average weight in the NBA?", "answer": "214.98", "sql": "SELECT AVG(CAST(SUBSTR(WT, 1, INSTR(WT,' ')) as INTEGER)) FROM nba_roster;"}
+{"question": "What is the median height in the NBA?", "answer": "6.58333333333333", "sql": "select CAST(SUBSTR(HT, 1, INSTR(HT,' ')-1) AS INTEGER)+ CAST(SUBSTR(HT, INSTR(HT,' ')+1) AS FLOAT)/12 as percentile from nba_roster order by percentile limit 1 offset (select count(*) from nba_roster)/2;"}
+{"question": "What is the average height in the NBA?", "answer": "6.54986111111111", "sql": "select AVG(CAST(SUBSTR(HT, 1, INSTR(HT,' ')-1) AS INTEGER)+ CAST(SUBSTR(HT, INSTR(HT,' ')+1) AS FLOAT)/12) as height from nba_roster;"}
+{"question": "Can you tell me how many players are in the NBA?", "answer": "600", "sql": "select count(*) from nba_roster;"}
+{"question": "Would you please let me know what the highest paid players are for each position?", "answer": "The highest paid players are Nikola Jokic (C), Paul George (F), Norman Powell (G), Kevin Durant (PF), Stephen Curry (PG), LeBron James (SF), Bradley Beal (SG).", "sql": "SELECT name, pos, MAX(CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER)) as max_salary FROM nba_roster WHERE SALARY!= '--' GROUP BY POS;"}
+{"question": "Is Jalen Johnson 23 years old?", "answer": "No, Jalen Johnson is 21 years old", "sql": "Select name, age from nba_roster where name='Jalen Johnson';"}
+{"question": "Who is the oldest player on the Brooklyn Nets?", "answer": "Spencer Dinwiddie, Dorian Finney-Smith, Royce O'Neale", "sql": "SELECT NAME FROM nba_roster WHERE TEAM = 'Brooklyn Nets' AND AGE = (SELECT MAX(AGE) FROM nba_roster WHERE TEAM = 'Brooklyn Nets');"}
+{"question": "Who has the highest salary on the Memphis Grizzlies?", "answer": "Ja Morant", "sql": "select salary, name from nba_roster where team='Memphis Grizzlies' and SALARY!= '--' ORDER BY CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER) DESC LIMIT 1;"}
+{"question": "Which player has the highest salary on the Cleveland Cavaliers?", "answer": "Darius Garland", "sql": "select salary, name from nba_roster where team='Cleveland Cavaliers' and SALARY!= '--' ORDER BY CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER) DESC LIMIT 1;"}
+{"question": "Who is the highest paid center on the Dallas Mavericks?", "answer": "Dereck Lively II", "sql": "select salary, name from nba_roster where team='Dallas Mavericks' and POS='C' and SALARY!= '--' ORDER BY CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER) DESC LIMIT 1;"}
+{"question": "How much is Marcus Smart getting paid?", "answer": "$18,833,712", "sql": "select salary from nba_roster where name='Marcus Smart';"}
+{"question": "What's the average age of the Trail Blazers?", "answer": "24", "sql": "select avg(age) from nba_roster where team='Portland Trail Blazers';"}
+{"question": "What's the median age of the NBA?", "answer": "25", "sql": "select CAST(AGE as INTEGER) as percentile from nba_roster order by percentile limit 1 offset (select count(*) from nba_roster)/2;"}
+{"question": "What's the median age of the Miami Heat?", "answer": "26", "sql": "select CAST(AGE as INTEGER) as percentile from nba_roster where team='Miami Heat' order by percentile limit 1 offset (select count(*) from nba_roster where team='Miami Heat')/2;"}
+{"question": "What are the 5 teams with the oldest average age in the NBA", "answer": "Golden State Warriors, Milwaukee Bucks, Miami Heat, LA Clippers, Phoenix Suns", "sql": "SELECT team, AVG(AGE) AS average_age FROM nba_roster GROUP BY team ORDER BY average_age DESC LIMIT 5;"}
+{"question": "What is the average salary of Power Forward players in the NBA", "answer": "$10948045", "sql": "select avg(CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER)) as average_salary from nba_roster where POS = 'PF';"}

Разница между файлами не показана из-за своего большого размера
+ 1149 - 0
recipes/3p_integrations/lamini/text2sql_memory_tuning/data/training_data/generated_queries_large.jsonl


+ 330 - 0
recipes/3p_integrations/lamini/text2sql_memory_tuning/data/training_data/generated_queries_large_filtered.jsonl

@@ -0,0 +1,330 @@
+{"question": "What college has the most players in the NBA who are 30 years old or older", "sql": "SELECT COLLEGE, COUNT(*) AS count FROM nba_roster WHERE AGE >= 30 GROUP BY COLLEGE ORDER BY count DESC LIMIT 1;"}
+{"question": "What is the total salary of all NBA players", "sql": "SELECT SUM(CAST(SUBSTR(SALARY, 1, INSTR(SALARY, '$')-1) AS INTEGER)*1000000) FROM nba_roster;"}
+{"question": "What are the most common positions in the NBA", "sql": "SELECT POS, COUNT(*) AS num_players FROM nba_roster GROUP BY POS;"}
+{"question": "What is the average salary for each age group in the NBA", "sql": "SELECT AVG(CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER)) as average_salary, AGE as age_group FROM nba_roster WHERE SALARY!= '--' GROUP BY AGE ORDER BY age_group;"}
+{"question": "What are the top 5 colleges that have produced the most NBA players", "sql": "SELECT COLLEGE, COUNT(*) as count FROM nba_roster WHERE COLLEGE!= '--' GROUP BY COLLEGE ORDER BY count DESC LIMIT 5;"}
+{"question": "How many players in the NBA attended college", "sql": "SELECT COUNT(*) AS num_college_players FROM nba_roster WHERE COLLEGE!= '--';"}
+{"question": "What are the top 3 colleges with the most players in the NBA", "sql": "SELECT COLLEGE, COUNT(*) as count FROM nba_roster WHERE COLLEGE!= '--' GROUP BY COLLEGE ORDER BY count DESC LIMIT 3;"}
+{"question": "What is the average age of all players in the NBA", "sql": "SELECT AVG(AGE) FROM nba_roster;"}
+{"question": "What is the most represented college in the NBA", "sql": "SELECT COLLEGE, COUNT(*) as count FROM nba_roster WHERE COLLEGE!= '--' GROUP BY COLLEGE ORDER BY count DESC LIMIT 1;"}
+{"question": "Which college has produced the most NBA players", "sql": "SELECT COLLEGE, COUNT(*) as count FROM nba_roster GROUP BY COLLEGE ORDER BY count DESC LIMIT 1;"}
+{"question": "What is the average height of NBA players", "sql": "SELECT AVG(CAST(SUBSTR(HT, 1, INSTR(HT,' ')-1) AS INTEGER) + CAST(SUBSTR(HT, INSTR(HT,' ')+1) AS FLOAT)/12) AS average_height FROM nba_roster;"}
+{"question": "What is the average age of players on each team in the NBA", "sql": "SELECT team, AVG(AGE) as avg_age FROM nba_roster WHERE SALARY!= '--' GROUP BY team;"}
+{"question": "What are the top 3 positions with the highest total salary expenditure in the NBA", "sql": "SELECT pos, name, SUM(CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER)) as total_salary FROM nba_roster WHERE SALARY!= '--' GROUP BY pos ORDER BY total_salary DESC LIMIT 3;"}
+{"question": "Which colleges have the most players in the NBA", "sql": "SELECT COLLEGE, COUNT(*) AS num_players FROM nba_roster WHERE COLLEGE!= '--' GROUP BY COLLEGE ORDER BY num_players DESC;"}
+{"question": "What is the average salary for each team in the NBA", "sql": "SELECT team, AVG(CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER)) as avg_salary FROM nba_roster WHERE SALARY!= '--' GROUP BY team;"}
+{"question": "What are the teams with the highest average salaries in the NBA", "sql": "SELECT team, AVG(CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER)) as avg_salary FROM nba_roster WHERE SALARY!= '--' GROUP BY team ORDER BY avg_salary DESC;"}
+{"question": "What are the 5 colleges that have produced the most players in the NBA", "sql": "SELECT COLLEGE, COUNT(*) as num_players FROM nba_roster WHERE COLLEGE!= '--' GROUP BY COLLEGE ORDER BY num_players DESC LIMIT 5;"}
+{"question": "What is the most common position in the NBA", "sql": "SELECT POS, COUNT(*) as count FROM nba_roster GROUP BY POS ORDER BY count DESC LIMIT 1;"}
+{"question": "What is the average salary of Power Forwards in the NBA who are at least 25 years old", "sql": "SELECT AVG(CAST(SUBSTR(SALARY, 1, INSTR(SALARY, '$')-1) AS INTEGER)) AS average_salary FROM nba_roster WHERE AGE >= 25 AND POS = 'PF';"}
+{"question": "What is the average age of 6-foot Power Forwards in the NBA", "sql": "SELECT AVG(AGE) FROM nba_roster WHERE CAST(SUBSTR(HT, 1, INSTR(HT,' ')-1) AS INTEGER) = 6 AND POS='PF';"}
+{"question": "What is the name of the player with the highest average weight among Power Forwards in the NBA", "sql": "SELECT NAME, AVG(CAST(SUBSTR(WT, 1, INSTR(WT,' ')) AS INTEGER)) AS avg_weight FROM nba_roster WHERE POS='PF' GROUP BY NAME ORDER BY avg_weight DESC LIMIT 1;"}
+{"question": "Which team has the most players in the NBA", "sql": "SELECT team, COUNT(*) as num_players FROM nba_roster GROUP BY team ORDER BY num_players DESC LIMIT 1;"}
+{"question": "What is the average salary of all NBA players", "sql": "SELECT AVG(CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER)) as average_salary FROM nba_roster;"}
+{"question": "What is the average age of the players on the Toronto Raptors", "sql": "SELECT AVG(AGE) FROM nba_roster WHERE team='Toronto Raptors';"}
+{"question": "Which three teams have the most players from a single college", "sql": "SELECT team, COLLEGE, COUNT(*) AS num_players FROM nba_roster GROUP BY team, COLLEGE ORDER BY num_players DESC LIMIT 3;"}
+{"question": "How many players in the NBA are at least 5 years older than the youngest player in the league", "sql": "SELECT COUNT(*) as num_players FROM nba_roster WHERE AGE - (SELECT MIN(AGE) FROM nba_roster) > 5;"}
+{"question": "What is the average salary of NBA players who are 25 years or older", "sql": "SELECT AVG(CAST(SUBSTR(SALARY, 1, INSTR(SALARY, '$') - 1) as INTEGER)) FROM nba_roster WHERE CAST(AGE as INTEGER) >= 25;"}
+{"question": "What is the number of players on each team in the NBA", "sql": "SELECT Team, COUNT(*) as num_players FROM nba_roster GROUP BY Team;"}
+{"question": "What is the average salary for each position in the NBA, excluding players with unknown salaries", "sql": "SELECT POS, AVG(CAST(SUBSTR(SALARY, 1, INSTR(SALARY, '$') - 1) as INTEGER)) as avg_salary FROM nba_roster WHERE SALARY!= '--' GROUP BY POS;"}
+{"question": "What are the oldest players on each team with a roster size of 6 or more", "sql": "SELECT NAME FROM nba_roster WHERE AGE IN (SELECT MAX(AGE) FROM nba_roster WHERE TEAM IN (SELECT TEAM FROM nba_roster GROUP BY TEAM HAVING COUNT(*) > 5));"}
+{"question": "What is the average height of the players on the Toronto Raptors", "sql": "SELECT AVG(CAST(SUBSTR(HT, 1, INSTR(HT,' ')-1) AS INTEGER)+ CAST(SUBSTR(HT, INSTR(HT,' ')+1) AS FLOAT)/12) as height FROM nba_roster WHERE team='Toronto Raptors';"}
+{"question": "What is the highest-paid Toronto Raptors player who attended college", "sql": "SELECT name, salary FROM nba_roster WHERE team='Toronto Raptors' AND COLLEGE!='--' AND SALARY!='--' ORDER BY CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER) DESC LIMIT 1;"}
+{"question": "What is the median weight in the NBA", "sql": "SELECT NAME, COLLEGE, COUNT(*) as num_colleges FROM nba_roster WHERE COLLEGE!= '--' GROUP BY NAME, COLLEGE ORDER BY num_colleges DESC;"}
+{"question": "Who are the top 5 highest-paid players in the NBA", "sql": "SELECT * FROM nba_roster WHERE SALARY!= '--' ORDER BY CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER) DESC LIMIT 5;"}
+{"question": "What is the average height of players on each NBA team", "sql": "SELECT team, AVG(CAST(SUBSTRING(HT, 1, INSTR(HT,'')-1) AS INTEGER) + CAST(SUBSTRING(HT, INSTR(HT,'')+1) AS INTEGER) / 12.0) as avg_height FROM nba_roster WHERE HT!= 'NA' GROUP BY team;"}
+{"question": "Who are the top 3 highest-paid players in the NBA", "sql": "SELECT name, SUM(CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER)) as total_salary FROM nba_roster WHERE SALARY!= '--' GROUP BY name ORDER BY total_salary DESC LIMIT 3;"}
+{"question": "What is the average salary of NBA players", "sql": "SELECT AVG(CAST(SUBSTR(SALARY, 1, INSTR(SALARY, '$')-1) AS INTEGER)) FROM nba_roster WHERE SALARY!= '--';"}
+{"question": "How many players in the NBA are 68 inches tall", "sql": "SELECT COUNT(*) FROM nba_roster WHERE CAST(SUBSTR(HT, 1, INSTR(HT,'')-1) AS INTEGER) = 68;"}
+{"question": "What are the top 5 teams with the oldest average age of players", "sql": "SELECT team, AVG(AGE) AS average_age FROM nba_roster GROUP BY team ORDER BY average_age DESC LIMIT 5;"}
+{"question": "What is the average salary of the Los Angeles Lakers players", "sql": "SELECT AVG(CAST(SALARY AS INTEGER) ) AS average_salary FROM nba_roster WHERE team='Los Angeles Lakers';"}
+{"question": "What is the college that has produced the most players currently playing for the Boston Celtics", "sql": "SELECT COLLEGE, COUNT(*) AS count FROM nba_roster WHERE team='Boston Celtics' GROUP BY COLLEGE ORDER BY count DESC LIMIT 1;"}
+{"question": "What is the most common position for players under the age of 25 in the NBA", "sql": "SELECT POS, COUNT(*) as count FROM nba_roster WHERE AGE <= 25 GROUP BY POS ORDER BY count DESC LIMIT 1;"}
+{"question": "What is the average height of players on each NBA team, excluding players with unknown heights", "sql": "SELECT TEAM, AVG(CAST(SUBSTRING(HT, 0, INSTR(HT,'')-1) AS INTEGER)) as avg_height FROM nba_roster WHERE HT!= 'NA' GROUP BY TEAM ORDER BY avg_height DESC;"}
+{"question": "What are the 5 most common heights among NBA players", "sql": "SELECT HT, COUNT(*) AS count FROM nba_roster GROUP BY HT ORDER BY count DESC LIMIT 5;"}
+{"question": "What are the top 5 colleges with the most players in the NBA", "sql": "SELECT COLLEGE, COUNT(*) AS count FROM nba_roster WHERE COLLEGE!= '--' GROUP BY COLLEGE ORDER BY count DESC LIMIT 5;"}
+{"question": "What is the average height of NBA players who are 25 years or older", "sql": "SELECT AVG(CAST(SUBSTR(HT, 1, INSTR(HT,' ')-1) AS INTEGER)+ CAST(SUBSTR(HT, INSTR(HT,' ')+1) AS FLOAT)/12) as height FROM nba_roster WHERE age >= 25;"}
+{"question": "What are the top 3 teams with the highest average salaries in the NBA", "sql": "SELECT team, AVG(CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER)) as avg_salary FROM nba_roster WHERE SALARY!= '--' GROUP BY team ORDER BY avg_salary DESC LIMIT 3;"}
+{"question": "What is the position with the most players in the NBA", "sql": "SELECT POS, COUNT(*) as count FROM nba_roster WHERE SALARY!= '--' GROUP BY POS ORDER BY count DESC LIMIT 1;"}
+{"question": "What is the average salary of NBA players who are at least 5 years old", "sql": "SELECT AVG(CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER)) as average_salary FROM nba_roster WHERE AGE > 5;"}
+{"question": "What is the most common age range among NBA players", "sql": "SELECT AGE, COUNT(*) AS count FROM nba_roster GROUP BY AGE ORDER BY count DESC LIMIT 1;"}
+{"question": "What is the average salary of NBA players who are 25 years old or older", "sql": "SELECT AVG(CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER)) as average_salary FROM nba_roster WHERE AGE > 25;"}
+{"question": "What is the average age of the players in the NBA who are more than 5 years older than the average age of all players", "sql": "SELECT AVG(AGE) FROM nba_roster WHERE AGE + (SELECT AVG(AGE) FROM nba_roster) > 5*12;"}
+{"question": "What is the average age of the players in the NBA who are older than 5 years old", "sql": "SELECT AVG(AGE) FROM nba_roster WHERE AGE > 5*12;"}
+{"question": "What colleges have produced the most NBA players", "sql": "SELECT COLLEGE, COUNT(*) as num_players FROM nba_roster WHERE COLLEGE!= '--' GROUP BY COLLEGE ORDER BY num_players DESC;"}
+{"question": "Who is the highest paid player in the NBA", "sql": "SELECT name, salary FROM nba_roster WHERE salary!= '--' ORDER BY CAST(REPLACE(REPLACE(salary, '$', ''), ',', '') AS INTEGER) DESC LIMIT 1;"}
+{"question": "How many players in the NBA are 5 years or younger than the oldest player in the league", "sql": "SELECT COUNT(*) FROM nba_roster WHERE AGE + 5 <= (SELECT MAX(AGE) FROM nba_roster);"}
+{"question": "What are the 5 teams with the highest average salary in the NBA", "sql": "SELECT team, AVG(CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER)) AS average_salary FROM nba_roster WHERE SALARY!= '--' GROUP BY team ORDER BY average_salary DESC;"}
+{"question": "What is the average salary for each team in the NBA, excluding teams with unknown salaries", "sql": "SELECT TEAM, AVG(CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER)) as average_salary FROM nba_roster WHERE SALARY!= '--' GROUP BY TEAM ORDER BY average_salary DESC;"}
+{"question": "How many players in the NBA are 10 years old or older", "sql": "SELECT COUNT(*) AS num_players FROM nba_roster WHERE age + (JULIANDAY('now') - JULIANDAY(DATE('now', '-10 year'))) / 365.25 >= 10;"}
+{"question": "How many players on the Toronto Raptors are 6 feet 8 inches tall", "sql": "SELECT COUNT(*) AS num_players FROM nba_roster WHERE team='Toronto Raptors' AND CAST(SUBSTRING(HT, 0, INSTR(HT,'')-1) AS INTEGER) = '6' || '8';"}
+{"question": "How many players in the NBA are over the age of 25", "sql": "SELECT COUNT(*) FROM nba_roster WHERE AGE > 25;"}
+{"question": "What is the average height of NBA players under the age of 25", "sql": "SELECT AVG(CAST(SUBSTR(HT, 1, INSTR(HT,' ')-1) AS INTEGER)+ CAST(SUBSTR(HT, INSTR(HT,' ')+1) AS FLOAT)/12) as average_height FROM nba_roster WHERE AGE <= 25;"}
+{"question": "What is the total salary of all players in the NBA who are more than 5 years older than the average age of all players", "sql": "SELECT SUM(CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER)) as total_salary FROM nba_roster WHERE (AGE - (SELECT AVG(AGE) FROM nba_roster)) > 5;"}
+{"question": "What is the most common height in the NBA", "sql": "SELECT SUBSTR(HT, 1, INSTR(HT,'')-1) as height, COUNT(*) as count FROM nba_roster GROUP BY SUBSTR(HT, 1, INSTR(HT,'')-1) ORDER BY count DESC LIMIT 1;"}
+{"question": "What is the average salary of NBA players 25 years or older", "sql": "SELECT AVG(CAST(SUBSTR(SALARY, 1, INSTR(SALARY, '$')-1) AS INTEGER)) FROM nba_roster WHERE AGE >= 25;"}
+{"question": "What are the 5 most common heights in the NBA", "sql": "SELECT HT, COUNT(*) AS frequency FROM nba_roster GROUP BY HT ORDER BY frequency DESC LIMIT 5;"}
+{"question": "What is the average height of the players on the Los Angeles Lakers", "sql": "SELECT AVG(CAST(SUBSTR(HT, 1, INSTR(HT,'')-1) AS INTEGER) + CAST(SUBSTR(HT, INSTR(HT,'')+1) AS FLOAT)/12) AS height FROM nba_roster WHERE TEAM = 'Los Angeles Lakers';"}
+{"question": "What is the average age of all players in the NBA who are older than 5 years old", "sql": "SELECT AVG(AGE) as average_age FROM nba_roster WHERE AGE > 5;"}
+{"question": "What is the most popular college attended by NBA players", "sql": "SELECT POS, COUNT(*) as count FROM nba_roster WHERE COLLEGE!= '--' GROUP BY POS ORDER BY count DESC LIMIT 1;"}
+{"question": "What is the average height for each position in the NBA", "sql": "SELECT POS, AVG(CAST(SUBSTR(HT, 1, INSTR(HT,'')-1) as INTEGER)) AS average_height FROM nba_roster GROUP BY POS ORDER BY average_height;"}
+{"question": "What are the jersey numbers of the first 5 players in the NBA roster", "sql": "SELECT NAME, JERSEY FROM nba_roster ORDER BY JERSEY LIMIT 5;"}
+{"question": "What is the age range of the players in the NBA", "sql": "SELECT MIN(AGE) as youngest_player, MAX(AGE) as oldest_player FROM nba_roster;"}
+{"question": "What is the total salary for each team in the NBA", "sql": "SELECT team, SUM(CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER)) as total_salary FROM nba_roster WHERE SALARY!= '--' GROUP BY team;"}
+{"question": "What are the top 5 teams in the NBA with the highest average salary", "sql": "SELECT team, AVG(CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER)) as avg_salary FROM nba_roster WHERE SALARY!= '--' GROUP BY team ORDER BY avg_salary DESC LIMIT 5;"}
+{"question": "What are the top 5 highest-paid players in the NBA", "sql": "SELECT NAME, CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER) as total_salary FROM nba_roster WHERE SALARY!= '--' ORDER BY total_salary DESC LIMIT 5;"}
+{"question": "What is the 99th percentile salary in the NBA", "sql": "SELECT COLLEGE, COUNT(*) as count FROM nba_roster WHERE COLLEGE!= '--' GROUP BY COLLEGE ORDER BY count DESC LIMIT 1;"}
+{"question": "How many players are on the Toronto Raptors", "sql": "SELECT COUNT(*) AS num_players FROM nba_roster WHERE TEAM = 'Toronto Raptors';"}
+{"question": "What are the 5 highest-paid players in the NBA", "sql": "SELECT * FROM (SELECT *, ROW_NUMBER() OVER (ORDER BY SALARY DESC) AS row_num FROM nba_roster) AS temp_table WHERE row_num <= 5;"}
+{"question": "Which players have had the most varied careers in the NBA, having played for the most different teams", "sql": "SELECT name, COUNT(DISTINCT team) as num_teams FROM nba_roster WHERE team!= 'NA' GROUP BY name ORDER BY num_teams DESC LIMIT 10;"}
+{"question": "Which three teams have the most players under the age of 25", "sql": "SELECT Team, COUNT(*) as num_players FROM nba_roster WHERE AGE < 25 GROUP BY Team ORDER BY num_players DESC LIMIT 3;"}
+{"question": "What are the colleges with the highest average salaries in the NBA", "sql": "SELECT college, COUNT(*) as count, AVG(CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER)) as avg_salary FROM nba_roster WHERE SALARY!= '--' GROUP BY college ORDER BY avg_salary DESC;"}
+{"question": "What is the name and jersey number of the player with the highest jersey number in the NBA", "sql": "SELECT NAME, JERSEY FROM nba_roster WHERE JERSEY!= 'NA' ORDER BY CAST(JERSEY AS INTEGER) DESC LIMIT 1;"}
+{"question": "What is the average age of NBA players", "sql": "SELECT AVG(AGE) AS average_age FROM nba_roster;"}
+{"question": "What are the top 3 teams with the oldest average age in the NBA", "sql": "SELECT TEAM, AVG(AGE) as average_age FROM nba_roster WHERE SALARY!= '--' GROUP BY TEAM ORDER BY average_age DESC LIMIT 3;"}
+{"question": "Which colleges have multiple players in the NBA", "sql": "SELECT COUNT(*) AS college_players, COLLEGE FROM nba_roster GROUP BY COLLEGE HAVING COUNT(*) > 1;"}
+{"question": "What is the average age of players on each NBA team", "sql": "SELECT team, AVG(CAST(AGE as INTEGER)) as avg_age FROM nba_roster GROUP BY team;"}
+{"question": "What is the average salary of Power Forward players in the NBA", "sql": "SELECT age, COUNT(*) as count FROM nba_roster GROUP BY age ORDER BY count DESC;"}
+{"question": "What is the team with the highest average salary for players over 25 years old", "sql": "SELECT team, AVG(CAST(SUBSTRING(SALARY, 2, LENGTH(SALARY)-2) AS INTEGER)) AS average_salary FROM nba_roster WHERE AGE > 25 AND SALARY!= '--' GROUP BY team ORDER BY average_salary DESC LIMIT 1;"}
+{"question": "What is the age range of players in the NBA", "sql": "SELECT MIN(AGE) as youngest, MAX(AGE) as oldest FROM nba_roster;"}
+{"question": "What is the most successful college in terms of producing NBA players", "sql": "SELECT COLLEGE, COUNT(*) as frequency FROM nba_roster GROUP BY COLLEGE ORDER BY frequency DESC LIMIT 1;"}
+{"question": "What is the average salary of the Boston Celtics players", "sql": "SELECT AVG(CAST(SALARY AS INTEGER) ) AS average_salary FROM nba_roster WHERE team='Boston Celtics';"}
+{"question": "Which colleges have produced the most NBA players", "sql": "SELECT COLLEGE, COUNT(*) AS count FROM nba_roster WHERE COLLEGE!= '--' GROUP BY COLLEGE ORDER BY count DESC;"}
+{"question": "Who is the highest-paid player in the NBA", "sql": "SELECT NAME FROM nba_roster WHERE SALARY = (SELECT MAX(SALARY) FROM nba_roster);"}
+{"question": "Which 5 players have the highest jersey numbers in the NBA", "sql": "SELECT name, jersey FROM nba_roster WHERE jersey!= 'NA' ORDER BY CAST(REPLACE(REPLACE(jersey, '0', ''), 'NA', '') AS INTEGER) DESC LIMIT 5;"}
+{"question": "What are the names of the players who are older than 30 years old in the NBA", "sql": "SELECT name, age FROM nba_roster WHERE age > 30 ORDER BY age;"}
+{"question": "How many players in the NBA are younger than the oldest player in the league by 25 years", "sql": "SELECT COUNT(*) FROM nba_roster WHERE AGE + 25 > (SELECT MAX(AGE) FROM nba_roster);"}
+{"question": "Which 10 players have played for the most teams in their NBA careers", "sql": "SELECT name, COUNT(DISTINCT team) AS num_teams FROM nba_roster GROUP BY name ORDER BY num_teams DESC LIMIT 10;"}
+{"question": "What is the average height for each height range in the NBA", "sql": "SELECT HT, COUNT(*) as count, AVG(CAST(SUBSTR(HT, 1, INSTR(HT,'')-1) AS INTEGER)) as avg_height FROM nba_roster WHERE HT!= 'NA' GROUP BY HT;"}
+{"question": "How many players in the NBA are 6 feet 8 inches tall", "sql": "SELECT COUNT(*) FROM nba_roster WHERE CAST(SUBSTRING(HT, 0, INSTR(HT,'')-1) AS INTEGER) = 68;"}
+{"question": "What percentage of players in the NBA are 10 years or less away from the oldest player in the league", "sql": "SELECT COUNT(*) FROM nba_roster WHERE AGE + 10 <= (SELECT MAX(AGE) FROM nba_roster);"}
+{"question": "What is the college that has produced the most NBA players", "sql": "SELECT COLLEGE, COUNT(*) AS count FROM nba_roster GROUP BY COLLEGE ORDER BY count DESC LIMIT 1;"}
+{"question": "What is the average salary of the youngest players on each NBA team", "sql": "SELECT team, AVG(CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER)) AS average_salary FROM nba_roster WHERE AGE <= 22 GROUP BY team ORDER BY average_salary DESC LIMIT 1;"}
+{"question": "What is the average age of players in the NBA who have a publicly disclosed salary", "sql": "SELECT AVG(AGE) as average_age FROM nba_roster WHERE SALARY!= '--';"}
+{"question": "What is the average salary for each position in the NBA", "sql": "SELECT AVG(CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER)) as average_salary, POS FROM nba_roster WHERE SALARY!= '--' GROUP BY POS ORDER BY average_salary DESC;"}
+{"question": "What is the average age of players in the NBA who are at least 60 years old", "sql": "SELECT AVG(AGE) as average_age FROM nba_roster WHERE AGE > 5*12;"}
+{"question": "Who are the 10 tallest players in the NBA", "sql": "SELECT HT, NAME FROM nba_roster ORDER BY CAST(SUBSTRING(HT, 0, INSTR(HT,'')-1) AS INTEGER) DESC LIMIT 10;"}
+{"question": "Which NBA team has the most players under the age of 25", "sql": "SELECT team, COUNT(*) as num_players FROM nba_roster WHERE AGE <= 25 GROUP BY team ORDER BY num_players DESC LIMIT 1;"}
+{"question": "What is the average age of players from each college, excluding those who did not attend college, listed in order from oldest to youngest", "sql": "SELECT COLLEGE, AVG(AGE) as average_age FROM nba_roster WHERE COLLEGE!= '--' GROUP BY COLLEGE ORDER BY average_age DESC;"}
+{"question": "What is the average salary for each position in the NBA, with the highest-paid positions listed first", "sql": "SELECT POS, AVG(CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER)) as average_salary FROM nba_roster WHERE SALARY!= '--' GROUP BY POS ORDER BY average_salary DESC;"}
+{"question": "What is the average height of NBA players 25 years old or older", "sql": "SELECT AVG(CAST(SUBSTR(HT, 1, INSTR(HT,' ')-1) AS INTEGER)+ CAST(SUBSTR(HT, INSTR(HT,' ')+1) AS FLOAT)/12) as height FROM nba_roster WHERE AGE >= 25;"}
+{"question": "What are the top 10 colleges with the most players in the NBA", "sql": "SELECT college, COUNT(*) as num_players FROM nba_roster WHERE college!= '--' GROUP BY college ORDER BY num_players DESC LIMIT 10;"}
+{"question": "What is the average height of all players in the NBA", "sql": "SELECT AVG(CAST(SUBSTR(HT, 1, INSTR(HT,' ')-1) AS INTEGER)) as average_height FROM nba_roster;"}
+{"question": "What are the top 5 colleges that produce the highest-paid NBA players", "sql": "SELECT COLLEGE, AVG(CAST(SUBSTR(SALARY, 2, LENGTH(SALARY)-2) AS INTEGER)) as average_salary FROM nba_roster WHERE SALARY!= '--' GROUP BY COLLEGE ORDER BY average_salary DESC LIMIT 5;"}
+{"question": "Which teams have the most players under 6'8", "sql": "SELECT team, COUNT(*) as num_players FROM nba_roster WHERE HT!= 'NA' AND CAST(SUBSTRING(HT, 0, INSTR(HT,'')-1) AS INTEGER) < 68 GROUP BY team;"}
+{"question": "What is the number of players in the NBA who are 25 years old or younger", "sql": "SELECT COUNT(*) FROM nba_roster WHERE AGE <= 25;"}
+{"question": "What is the team with the highest average salary in the NBA", "sql": "SELECT team, AVG(CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER)) AS average_salary FROM nba_roster WHERE SALARY!= '--' GROUP BY team ORDER BY average_salary DESC LIMIT 1;"}
+{"question": "What are the average heights for each position in the NBA, from tallest to shortest", "sql": "SELECT AVG(CAST(SUBSTR(HT, 1, INSTR(HT,' ')-1) AS INTEGER) + CAST(SUBSTR(HT, INSTR(HT,' ')+1) AS FLOAT)/12) as average_height, POS FROM nba_roster GROUP BY POS ORDER BY average_height DESC;"}
+{"question": "How many players in the NBA are over the age of 30", "sql": "SELECT COUNT(*) as num_players FROM nba_roster WHERE AGE > 30;"}
+{"question": "Who is the tallest player in the NBA", "sql": "SELECT NAME, HT FROM nba_roster ORDER BY LENGTH(HT) DESC LIMIT 1;"}
+{"question": "What are the top 3 teams in the NBA with the highest average salary", "sql": "SELECT team, AVG(CAST(SUBSTR(SALARY, 1, INSTR(SALARY, '$') - 1) AS INTEGER)) AS avg_salary FROM nba_roster WHERE SALARY!= '--' GROUP BY team ORDER BY avg_salary DESC LIMIT 3;"}
+{"question": "Which team has the highest average salary in the NBA", "sql": "SELECT team, AVG(CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER)) as average_salary FROM nba_roster WHERE SALARY!= '--' GROUP BY team ORDER BY average_salary DESC LIMIT 1;"}
+{"question": "What is the total number of players in the NBA who have attended a college other than '--'?", "sql": "SELECT COUNT(*) as num_players FROM nba_roster WHERE COLLEGE!= '--';"}
+{"question": "Who is the player who has played for the most teams in their NBA career", "sql": "SELECT NAME, COUNT(DISTINCT TEAM) AS num_teams FROM nba_roster WHERE SALARY!= '--' GROUP BY NAME ORDER BY num_teams DESC LIMIT 1;"}
+{"question": "What are the top 10 highest-paid college-educated players in the NBA", "sql": "SELECT name, SUM(CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER)) AS total_salary FROM nba_roster WHERE COLLEGE!= '--' GROUP BY name ORDER BY total_salary DESC LIMIT 10;"}
+{"question": "Which NBA players have attended multiple colleges", "sql": "SELECT NAME, COLLEGE FROM nba_roster WHERE COLLEGE!= '--' GROUP BY NAME, COLLEGE HAVING COUNT(COLLEGE) > 1;"}
+{"question": "What are the 5 teams with the tallest average height in the NBA", "sql": "SELECT team, AVG(CAST(SUBSTR(HT, 1, INSTR(HT,'')-1) AS INTEGER)) AS average_height FROM nba_roster GROUP BY team ORDER BY average_height DESC;"}
+{"question": "What is the average height of players in the NBA who are older than 25 years old", "sql": "SELECT AVG(CAST(SUBSTR(HT, 1, INSTR(HT,'')-1) AS INTEGER)) AS average_height FROM nba_roster WHERE AGE > 25;"}
+{"question": "How many players are on the Toronto Raptors' roster", "sql": "SELECT COUNT(*) FROM nba_roster WHERE team='Toronto Raptors';"}
+{"question": "What is the weight of the heaviest 75% of NBA players", "sql": "SELECT WT FROM nba_roster ORDER BY CAST(REPLACE(REPLACE(WT,'lbs', ''),'', '') AS INTEGER) DESC LIMIT 1 OFFSET (SELECT COUNT(*) FROM nba_roster WHERE WT!= '--') * 75 / 100 - 1;"}
+{"question": "Who is the highest-paid player in the league, excluding those with unknown positions, salaries, or colleges", "sql": "SELECT name, salary FROM nba_roster WHERE POS!= 'NA' AND SALARY!= '--' AND COLLEGE!= '--' ORDER BY CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER) DESC LIMIT 1;"}
+{"question": "How many players in the NBA attended Duke, Kentucky, or North Carolina for college", "sql": "SELECT COUNT(*) AS count FROM nba_roster WHERE COLLEGE!= '--' AND COLLEGE IN ('Duke', 'Kentucky', 'North Carolina');"}
+{"question": "What is the most common college represented in the NBA", "sql": "SELECT COLLEGE, COUNT(*) as frequency FROM nba_roster WHERE COLLEGE!= '--' GROUP BY COLLEGE ORDER BY frequency DESC LIMIT 1;"}
+{"question": "What is the number of players in the NBA who attended a college other than '--'?", "sql": "SELECT COUNT(*) FROM nba_roster WHERE COLLEGE!= '--';"}
+{"question": "How many players on the Toronto Raptors are 25 years old or older", "sql": "SELECT COUNT(*) AS num_players FROM nba_roster WHERE team='Toronto Raptors' AND AGE >= 25;"}
+{"question": "How many players on the Toronto Raptors are 6'8", "sql": "SELECT COUNT(*) FROM nba_roster WHERE TEAM = 'Toronto Raptors' AND CAST(SUBSTRING(HT, 0, INSTR(HT,'')-1) AS INTEGER) = '6' || '8';"}
+{"question": "What is the team with the most players over 30 years old in the NBA", "sql": "SELECT Team, COUNT(*) as num_players FROM nba_roster WHERE AGE > 30 GROUP BY Team ORDER BY num_players DESC LIMIT 1;"}
+{"question": "What is the highest-paid Power Forward in the NBA", "sql": "SELECT POS, NAME, CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER) as Salary FROM nba_roster WHERE SALARY!= '--' ORDER BY Salary DESC LIMIT 1 OFFSET (SELECT COUNT(*) FROM nba_roster WHERE SALARY!= '--' AND POS = 'PF')-1;"}
+{"question": "How many players in the NBA are older than the average age of all players", "sql": "SELECT COUNT(*) FROM nba_roster WHERE AGE > (SELECT AVG(AGE) FROM nba_roster);"}
+{"question": "What positions in the NBA tend to have the oldest average age", "sql": "SELECT POS, COUNT(*) AS count, AVG(AGE) AS average_age FROM nba_roster GROUP BY POS ORDER BY average_age DESC;"}
+{"question": "Which players have more than 5 teammates with the same name", "sql": "SELECT NAME FROM nba_roster WHERE (SELECT COUNT(*) FROM nba_roster WHERE NAME = nba_roster.NAME AND TEAM = nba_roster.TEAM) > 5;"}
+{"question": "Which teams have the most players in the NBA", "sql": "SELECT Team, COUNT(*) as num_players FROM nba_roster GROUP BY Team ORDER BY num_players DESC;"}
+{"question": "What is the total salary of the most expensive team in the NBA", "sql": "SELECT Team, SUM(CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER)) as Total_Salary FROM nba_roster WHERE SALARY!= '--' GROUP BY Team ORDER BY Total_Salary DESC;"}
+{"question": "How many players on the Boston Celtics are 6 feet 8 inches tall or taller", "sql": "SELECT COUNT(*) AS num_players FROM nba_roster WHERE team='Boston Celtics' AND CAST(SUBSTRING(HT, 0, INSTR(HT,'')-1) AS INTEGER) = '6' || '8';"}
+{"question": "What are the most common colleges represented in the NBA", "sql": "SELECT COLLEGE, COUNT(*) as count FROM nba_roster GROUP BY COLLEGE ORDER BY count DESC;"}
+{"question": "What are the 5 teams with the oldest average age in the NBA", "sql": "SELECT team, AVG(AGE) AS average_age, COUNT(*) AS num_players FROM nba_roster GROUP BY team HAVING COUNT(*) > 5 ORDER BY average_age DESC;"}
+{"question": "How many players in the NBA are 6 feet tall", "sql": "SELECT COUNT(*) FROM nba_roster WHERE CAST(SUBSTR(HT, 1, INSTR(HT,'')-1) AS INTEGER) = 6;"}
+{"question": "Who are the tallest players in the NBA", "sql": "SELECT NAME FROM nba_roster WHERE HT > (SELECT AVG(CAST(SUBSTR(HT, 1, INSTR(HT,' ')-1) AS INTEGER)+ CAST(SUBSTR(HT, INSTR(HT,' ')+1) AS FLOAT)/12) FROM nba_roster);"}
+{"question": "What are the ages of the youngest and oldest players in the NBA", "sql": "SELECT MIN(AGE) AS youngest_player, MAX(AGE) AS oldest_player FROM nba_roster;"}
+{"question": "What are the 5 teams with the lightest average weight for players with known heights", "sql": "SELECT HT, WT, AVG(CAST(SUBSTR(WT, 1, LENGTH(WT)-3) AS INTEGER)) AS avg_weight FROM nba_roster WHERE HT!= 'NA' GROUP BY HT ORDER BY avg_weight DESC LIMIT 5;"}
+{"question": "What are the top 5 positions with the tallest average height in the NBA", "sql": "SELECT POS, COUNT(*) AS count, AVG(CAST(SUBSTR(HT, 1, LENGTH(HT)-2) AS INTEGER)) AS avg_height FROM nba_roster WHERE HT!= 'NA' GROUP BY POS ORDER BY count DESC LIMIT 5;"}
+{"question": "Which 5 players have played for the most teams in their NBA careers", "sql": "SELECT NAME, COUNT(DISTINCT team) AS num_teams FROM nba_roster GROUP BY NAME ORDER BY num_teams DESC LIMIT 5;"}
+{"question": "What are the most common heights in the NBA", "sql": "SELECT HT, COUNT(*) as count FROM nba_roster GROUP BY HT ORDER BY count DESC LIMIT 10;"}
+{"question": "How many players on the Los Angeles Lakers are 6 feet 8 inches tall", "sql": "SELECT COUNT(*) as num_players FROM nba_roster WHERE team='Los Angeles Lakers' AND CAST(SUBSTR(HT, 1, INSTR(HT,'')-1) AS INTEGER) = '6' || '8';"}
+{"question": "What are the most common positions for players under the age of 25 in the NBA", "sql": "SELECT POS, COUNT(*) as count FROM nba_roster WHERE AGE < 25 GROUP BY POS ORDER BY count DESC;"}
+{"question": "What are the top colleges that produce the most NBA players", "sql": "SELECT COLLEGE, COUNT(*) as count FROM nba_roster WHERE COLLEGE!= '--' GROUP BY COLLEGE ORDER BY count DESC;"}
+{"question": "What are the colleges that have produced the most NBA players", "sql": "SELECT COLLEGE, COUNT(*) AS num_players FROM nba_roster GROUP BY COLLEGE ORDER BY num_players DESC;"}
+{"question": "How many players in the NBA are 25 years or younger", "sql": "SELECT COUNT(*) FROM nba_roster WHERE AGE + 25 <= (SELECT MAX(AGE) FROM nba_roster);"}
+{"question": "What is the average age of players from the college that has produced the youngest players in the NBA", "sql": "SELECT AVG(AGE) as average_age FROM nba_roster WHERE COLLEGE!= '--' GROUP BY COLLEGE ORDER BY average_age LIMIT 1;"}
+{"question": "How many players in the NBA have attended Duke, Kentucky, North Carolina, or did not attend college", "sql": "SELECT COUNT(*) FROM nba_roster WHERE COLLEGE IN ('--', 'Duke', 'Kentucky', 'North Carolina');"}
+{"question": "What are the teams with the most players from a particular college", "sql": "SELECT team, COLLEGE, COUNT(*) AS num_players FROM nba_roster GROUP BY team, COLLEGE ORDER BY num_players DESC;"}
+{"question": "What is the number of players in the NBA who are older than 10 years old", "sql": "SELECT COUNT(*) FROM nba_roster WHERE (CAST(CAST(AGE AS INTEGER) AS REAL) > 10);"}
+{"question": "What are the top 3 highest paid players from each college", "sql": "SELECT name, college, MAX(CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER)) as max_salary FROM nba_roster WHERE SALARY!= '--' GROUP BY college ORDER BY max_salary DESC LIMIT 3;"}
+{"question": "How many players in the NBA are at least 6 feet 8 inches tall", "sql": "SELECT COUNT(*) as num_players FROM nba_roster WHERE CAST(SUBSTR(HT, 1, INSTR(HT,'')-1) AS INTEGER) >= 68;"}
+{"question": "Which NBA teams have the most players from a particular college", "sql": "SELECT Team, COLLEGE, COUNT(*) as Count FROM nba_roster WHERE COLLEGE!= '--' GROUP BY Team, COLLEGE ORDER BY Count DESC;"}
+{"question": "What is the most common college attended by NBA players", "sql": "SELECT COLLEGE, COUNT(*) AS frequency FROM nba_roster GROUP BY COLLEGE ORDER BY frequency DESC LIMIT 1;"}
+{"question": "What is the total salary of all NBA players, excluding those with unknown salaries", "sql": "SELECT SUM(CAST(SUBSTR(SALARY, 1, INSTR(SALARY, '$')-1) AS INTEGER)*1000000) AS total_salary FROM nba_roster WHERE SALARY!= '--';"}
+{"question": "What are the teams with the tallest average height in the NBA", "sql": "SELECT team, AVG(LENGTH(HT)) AS average_height FROM nba_roster GROUP BY team ORDER BY average_height DESC;"}
+{"question": "Which 10 players have played for the most teams in their NBA career", "sql": "SELECT name, COUNT(DISTINCT team) as num_teams FROM nba_roster WHERE SALARY!= '--' GROUP BY name ORDER BY num_teams DESC LIMIT 10;"}
+{"question": "What is the average height of NBA players 25 years old or younger", "sql": "SELECT AVG(CAST(SUBSTR(HT, 1, INSTR(HT,'')) AS INTEGER)) AS average_height FROM nba_roster WHERE AGE <= 25;"}
+{"question": "What is the average weight of NBA players", "sql": "SELECT AVG(CAST(SUBSTR(WT, 1, INSTR(WT,' ')-1) AS INTEGER) + CAST(SUBSTR(WT, INSTR(WT,' ')+1) AS FLOAT)/16) as average_weight FROM nba_roster WHERE WT!= '--';"}
+{"question": "Which teams in the NBA have a significantly larger roster size compared to the number of point guards in the league", "sql": "SELECT Team, COUNT(*) as num_players FROM nba_roster GROUP BY Team HAVING COUNT(*) > (SELECT COUNT(*) FROM nba_roster WHERE POS = 'PG')*0.3;"}
+{"question": "What are the top 5 colleges that produce the oldest average age of NBA players", "sql": "SELECT COLLEGE, AVG(AGE) as avg_age FROM nba_roster WHERE COLLEGE!= '--' GROUP BY COLLEGE ORDER BY avg_age DESC LIMIT 5;"}
+{"question": "What is the average salary of all players in the positions of PG, SG, SF, PF, and C in the NBA", "sql": "SELECT AVG(CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER)) as average_salary FROM nba_roster WHERE POS = 'PG' OR POS = 'SG' OR POS = 'SF' OR POS = 'PF' OR POS = 'C';"}
+{"question": "Who is the player with the highest salary in the NBA", "sql": "SELECT NAME, SALARY FROM nba_roster WHERE SALARY = (SELECT MAX(SALARY) FROM nba_roster);"}
+{"question": "What are the top 10 teams with the most players in the NBA, considering only teams with at least 10 players with height information", "sql": "SELECT name, AVG(CAST(SUBSTR(HT, 1, INSTR(HT,' ')-1) AS INTEGER)) as avg_height, COUNT(*) as count FROM nba_roster WHERE HT!= 'NA' GROUP BY name ORDER BY count DESC LIMIT 10;"}
+{"question": "Which players have played for the most teams in their NBA careers", "sql": "SELECT name, COUNT(DISTINCT team) as team_count FROM nba_roster WHERE team!= 'NA' GROUP BY name ORDER BY team_count DESC LIMIT 10;"}
+{"question": "What is the 75th percentile jersey number in the NBA", "sql": "SELECT CAST(Jersey AS INTEGER) as percentile FROM nba_roster ORDER BY CAST(Jersey AS INTEGER) LIMIT 1 OFFSET (SELECT COUNT(*) FROM nba_roster) * 0.75;"}
+{"question": "How many players in the NBA are younger than the oldest player in the league by 15 years", "sql": "SELECT COUNT(*) as num_players FROM nba_roster WHERE AGE + 15 > (SELECT MAX(AGE) FROM nba_roster);"}
+{"question": "Which jersey numbers are the most popular among NBA players", "sql": "SELECT NAME, JERSEY FROM nba_roster GROUP BY JERSEY ORDER BY COUNT(*) DESC LIMIT 3;"}
+{"question": "Which team has the highest average salary", "sql": "SELECT team, AVG(CAST(SUBSTRING(SALARY, 2, LENGTH(SALARY)-2) AS INTEGER)) AS avg_salary FROM nba_roster WHERE SALARY!= '--' GROUP BY team ORDER BY avg_salary DESC LIMIT 1;"}
+{"question": "How many players in the NBA are older than 25 years old", "sql": "SELECT COUNT(*) as num_players FROM nba_roster WHERE AGE > 25;"}
+{"question": "Which colleges have produced the most multiple NBA players", "sql": "SELECT COLLEGE, COUNT(*) FROM nba_roster GROUP BY COLLEGE HAVING COUNT(*) > 1;"}
+{"question": "Who has the highest salary on the Los Angeles Lakers", "sql": "SELECT name, salary FROM nba_roster WHERE team='Los Angeles Lakers' AND salary!= '--' ORDER BY CAST(REPLACE(REPLACE(salary, '$', ''), ',', '') AS INTEGER) DESC LIMIT 1;"}
+{"question": "What are the minimum and maximum salaries for each team in the NBA", "sql": "SELECT MIN(CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER)) as min_salary, MAX(CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER)) as max_salary, team FROM nba_roster WHERE SALARY!= '--' GROUP BY team ORDER BY min_salary DESC, max_salary DESC;"}
+{"question": "What is the average age of the team with the oldest roster in the NBA", "sql": "SELECT AVG(AGE) as avg_age FROM nba_roster GROUP BY team ORDER BY avg_age DESC LIMIT 1;"}
+{"question": "What are the teams with more than 5 players in the age range of 25 to 30 in the NBA", "sql": "SELECT team, COUNT(*) AS num_players FROM nba_roster WHERE AGE BETWEEN 25 AND 30 GROUP BY team HAVING COUNT(*) > 5;"}
+{"question": "Who is the highest-paid player who did not attend college", "sql": "SELECT name, salary FROM nba_roster WHERE SALARY!= '--' AND COLLEGE = '--' ORDER BY CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER) DESC LIMIT 1;"}
+{"question": "What is the total number of players in the NBA", "sql": "SELECT COUNT(*) FROM nba_roster;"}
+{"question": "What is the most common position among players under the age of 25 in the NBA", "sql": "SELECT POS, COUNT(*) as count FROM nba_roster WHERE AGE <= 25 GROUP BY POS ORDER BY count DESC LIMIT 1;"}
+{"question": "Who is the oldest player in the NBA", "sql": "SELECT name, age FROM nba_roster ORDER BY age DESC LIMIT 1;"}
+{"question": "What are the minimum and maximum salaries in the NBA", "sql": "SELECT MIN(CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER)) as min_salary, MAX(CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER)) as max_salary FROM nba_roster WHERE SALARY!= '--';"}
+{"question": "What is the average salary of Power Forward players in the NBA who are under the age of 25", "sql": "SELECT AVG(CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER)) as average_salary FROM nba_roster WHERE POS = 'PF' AND AGE < 25;"}
+{"question": "What is the total number of players in the NBA who are 25 years or younger", "sql": "SELECT COUNT(*) as total_players FROM nba_roster WHERE AGE + 25 <= (SELECT MAX(AGE) FROM nba_roster);"}
+{"question": "Who is the highest-paid player on the Toronto Raptors", "sql": "SELECT NAME, SALARY FROM nba_roster WHERE TEAM = 'Toronto Raptors' AND SALARY = (SELECT MAX(SALARY) FROM nba_roster WHERE TEAM = 'Toronto Raptors');"}
+{"question": "Who is the highest-paid player on the Los Angeles Lakers", "sql": "SELECT name FROM nba_roster WHERE team='Los Angeles Lakers' AND SALARY!= '--' ORDER BY CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER) DESC LIMIT 1;"}
+{"question": "What are the top 3 teams with the most players over the age of 5 in the NBA", "sql": "SELECT team, COUNT(*) as num_players FROM nba_roster WHERE age > 5 GROUP BY team ORDER BY num_players DESC LIMIT 3;"}
+{"question": "Which teams have the tallest players, excluding those with unknown salaries", "sql": "SELECT team, AVG(CAST(SUBSTR(HT, 1, INSTR(HT,' ')-1) AS INTEGER)) as avg_height FROM nba_roster WHERE SALARY!= '--' GROUP BY team ORDER BY avg_height DESC;"}
+{"question": "What is the number of players in the NBA who are 25 years or younger", "sql": "SELECT COUNT(*) as num_players FROM nba_roster WHERE AGE + 25 <= (SELECT MAX(AGE) FROM nba_roster);"}
+{"question": "What is the age group with the most players in the NBA", "sql": "SELECT AGE, COUNT(*) as count FROM nba_roster GROUP BY AGE ORDER BY count DESC LIMIT 1;"}
+{"question": "What is the most common position for players aged 25 or older in the NBA", "sql": "SELECT POS, COUNT(*) as count FROM nba_roster WHERE AGE >= 25 GROUP BY POS ORDER BY count DESC LIMIT 1;"}
+{"question": "What is the total salary of all players in the NBA", "sql": "SELECT SUM(CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER)) as total_salary FROM nba_roster;"}
+{"question": "Which three teams have the most players from the same college", "sql": "SELECT team, COUNT(*) AS num_players, COLLEGE FROM nba_roster GROUP BY team, COLLEGE ORDER BY num_players DESC LIMIT 3;"}
+{"question": "What is the average age of players in the NBA who are more than 5 years older than the average age of all players", "sql": "SELECT AVG(AGE) as average_age FROM nba_roster WHERE AGE - (SELECT AVG(AGE) FROM nba_roster) > 5;"}
+{"question": "What is the heaviest player in the NBA", "sql": "SELECT NAME, WT FROM nba_roster WHERE WT!= 'NA' ORDER BY CAST(SUBSTRING(WT, 0, INSTR(WT,'') - 1) AS INTEGER) DESC LIMIT 1;"}
+{"question": "What is the average height of all players in the NBA roster", "sql": "SELECT AVG(LENGTH(HT)) AS average_height FROM nba_roster;"}
+{"question": "What are the average height and age of players on each team in the NBA", "sql": "SELECT team, AVG(CAST(SUBSTR(HT, 1, INSTR(HT,'')-1) AS INTEGER)) AS average_height, AVG(AGE) AS average_age FROM nba_roster GROUP BY team ORDER BY average_age DESC;"}
+{"question": "How many players in the NBA are 6' or 8' tall", "sql": "SELECT COUNT(*) AS num_players FROM nba_roster WHERE CAST(SUBSTRING(HT, 1, INSTR(HT,'')-1) AS INTEGER) = 6 | 8;"}
+{"question": "What is the shortest weight listed in the 'nba_roster' table", "sql": "SELECT NAME, WT FROM nba_roster ORDER BY LENGTH(WT) LIMIT 1;"}
+{"question": "What is the highest-paid player in the NBA", "sql": "SELECT TEAM, NAME, SALARY FROM nba_roster WHERE SALARY = (SELECT MAX(SALARY) FROM nba_roster) ORDER BY TEAM;"}
+{"question": "What college has produced the most NBA players", "sql": "SELECT COLLEGE, COUNT(*) AS frequency FROM nba_roster WHERE COLLEGE!= '--' GROUP BY COLLEGE ORDER BY frequency DESC LIMIT 1;"}
+{"question": "What is the total salary of all players in the NBA who are 25 years old or younger", "sql": "SELECT SUM(CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER)) as total_salary FROM nba_roster WHERE AGE <= 25;"}
+{"question": "What is the oldest player for each position in the NBA", "sql": "SELECT pos, NAME, MAX(AGE) as max_age FROM nba_roster GROUP BY pos;"}
+{"question": "Who is the highest-paid player in the NBA who did not attend college", "sql": "SELECT name, salary FROM nba_roster WHERE COLLEGE = '--' ORDER BY CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER) DESC LIMIT 1;"}
+{"question": "What is the salary of the 25th percentile of players in the NBA who are 25 years old or younger", "sql": "SELECT CAST(SALARY as INTEGER) as percentile FROM nba_roster WHERE AGE <= 25 ORDER BY percentile LIMIT 1 OFFSET (SELECT COUNT(*) FROM nba_roster WHERE AGE <= 25) / 4;"}
+{"question": "What are the most common positions in the NBA, and which position has the highest average weight", "sql": "SELECT POS, COUNT(*) AS count, AVG(CAST(SUBSTR(WT, 1, INSTR(WT,'')) AS INTEGER)) AS average_weight FROM nba_roster WHERE POS!= 'NA' GROUP BY POS ORDER BY count DESC;"}
+{"question": "What is the 75th percentile age of the NBA players", "sql": "SELECT CAST(AGE AS INTEGER) AS percentile FROM nba_roster ORDER BY percentile LIMIT 1 OFFSET (SELECT COUNT(*) FROM nba_roster) * 0.75;"}
+{"question": "What is the average salary of paid NBA players", "sql": "SELECT AVG(CAST(SUBSTR(SALARY, 1, INSTR(SALARY,' ')-1) AS INTEGER)) as average_salary FROM nba_roster WHERE SALARY!= '--';"}
+{"question": "What age group has the most players in the NBA", "sql": "SELECT AGE, COUNT(*) as count FROM nba_roster GROUP BY AGE ORDER BY count DESC;"}
+{"question": "What is the height of the tallest player on the Los Angeles Lakers", "sql": "SELECT HT, NAME FROM nba_roster WHERE team='Los Angeles Lakers' AND HT!= 'NA' ORDER BY CAST(SUBSTRING(HT, 0, INSTR(HT,'')) AS INTEGER) DESC LIMIT 1;"}
+{"question": "What is the average salary of the Toronto Raptors players", "sql": "SELECT AVG(CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER)) AS average_salary FROM nba_roster WHERE team='Toronto Raptors';"}
+{"question": "What is the average salary of an NBA player", "sql": "SELECT AVG(CAST(SALARY AS INTEGER) / 1000000) AS average_salary FROM nba_roster;"}
+{"question": "What is the team with the highest average age in the NBA", "sql": "SELECT team, AVG(age) AS average_age FROM nba_roster GROUP BY team ORDER BY average_age DESC LIMIT 1;"}
+{"question": "Which team has the most players over the age of 25 in the NBA", "sql": "SELECT Team, COUNT(*) FROM nba_roster WHERE AGE > 25 GROUP BY Team ORDER BY COUNT(*) DESC LIMIT 1;"}
+{"question": "What is the total salary of the team with the highest total salary in the NBA", "sql": "SELECT SUM(CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER)) AS total_salary, team FROM nba_roster WHERE SALARY!= '--' GROUP BY team ORDER BY total_salary DESC;"}
+{"question": "How many players in the NBA are exactly 6 feet tall", "sql": "SELECT COUNT(*) FROM nba_roster WHERE CAST(SUBSTRING(HT, 0, INSTR(HT,'')-1) AS INTEGER) = 6 AND HT!= 'NA';"}
+{"question": "What is the age with the most unique players in the NBA", "sql": "SELECT COUNT(DISTINCT AGE) AS age_count, AGE FROM nba_roster GROUP BY AGE ORDER BY age_count DESC LIMIT 1;"}
+{"question": "What is the highest-paid player who did not attend college", "sql": "SELECT name, salary FROM nba_roster WHERE COLLEGE = '--' AND SALARY!= '--' ORDER BY CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER) DESC LIMIT 1;"}
+{"question": "Which age group has the most players in the NBA", "sql": "SELECT COUNT(*), AGE FROM nba_roster GROUP BY AGE ORDER BY COUNT(*) DESC;"}
+{"question": "What is the average height in the NBA?", "sql": "SELECT COUNT(*) as num_college_players FROM nba_roster WHERE COLLEGE!= '--';"}
+{"question": "Which position has the most players in the NBA", "sql": "SELECT POS, COUNT(*) AS count FROM nba_roster GROUP BY POS ORDER BY count DESC LIMIT 1;"}
+{"question": "What are the top 10 colleges that have produced the most NBA players", "sql": "SELECT COLLEGE, COUNT(*) AS num_players FROM nba_roster WHERE COLLEGE!= '--' GROUP BY COLLEGE ORDER BY num_players DESC LIMIT 10;"}
+{"question": "What is the average age of players from colleges that have multiple players in the NBA", "sql": "SELECT AVG(AGE) AS average_age, COLLEGE FROM nba_roster GROUP BY COLLEGE HAVING COUNT(*) > 1;"}
+{"question": "Which colleges have the most representation in the NBA", "sql": "SELECT COLLEGE, COUNT(*) AS num_players FROM nba_roster WHERE COLLEGE!= '--' GROUP BY COLLEGE ORDER BY num_players DESC;"}
+{"question": "Who are the oldest players in the NBA, excluding those who are above the average age of all players", "sql": "SELECT NAME FROM nba_roster WHERE AGE > (SELECT AVG(AGE) FROM nba_roster) ORDER BY AGE DESC;"}
+{"question": "What are the top 3 highest-paid players on the Toronto Raptors", "sql": "SELECT name, SALARY FROM nba_roster WHERE team='Toronto Raptors' ORDER BY CAST(SUBSTRING(SALARY, 2) AS INTEGER) DESC LIMIT 3;"}
+{"question": "Which colleges have produced multiple players in the NBA", "sql": "SELECT COLLEGE, COUNT(*) AS num_players FROM nba_roster GROUP BY COLLEGE HAVING COUNT(*) > 1;"}
+{"question": "What is the average salary of NBA players 25 years old or younger", "sql": "SELECT AVG(CAST(SUBSTR(SALARY, 1, INSTR(SALARY, '$')-1) AS INTEGER)) AS average_salary FROM nba_roster WHERE CAST(AGE AS INTEGER) <= 25;"}
+{"question": "What is the highest-paid player who has played for more than one team", "sql": "SELECT NAME, TEAM, SALARY FROM nba_roster WHERE SALARY = (SELECT MAX(SALARY) FROM nba_roster) AND (SELECT COUNT(DISTINCT TEAM) FROM nba_roster WHERE NAME = nba_roster.NAME) > 1;"}
+{"question": "Who is the tallest player in the NBA, based on average height", "sql": "SELECT NAME, AVG(CAST(SUBSTR(HT, 1, INSTR(HT,'')) AS INTEGER)) AS AVG_HEIGHT, COUNT(DISTINCT TEAM) AS TEAM_COUNT FROM nba_roster GROUP BY NAME ORDER BY AVG_HEIGHT DESC LIMIT 1;"}
+{"question": "What is the total weight of all players in the NBA", "sql": "SELECT SUM(CAST(SUBSTR(WT, 1, INSTR(WT,' ')) as INTEGER)) FROM nba_roster;"}
+{"question": "What are the top 10 highest-paid teams in the NBA, based on the average salary of their players", "sql": "SELECT AVG(CAST(SUBSTR(SALARY, 1, INSTR(SALARY, '$')-1) AS INTEGER)) as avg_salary, AVG(AGE) as avg_age FROM nba_roster WHERE SALARY!= '--' GROUP BY SALARY ORDER BY avg_salary DESC LIMIT 10;"}
+{"question": "What is the highest salary for each team in the NBA", "sql": "SELECT team, MAX(CAST(SUBSTR(SALARY, 1, INSTR(SALARY, '$')-1) AS INTEGER)) as highest_salary FROM nba_roster WHERE SALARY!= '--' GROUP BY team;"}
+{"question": "What is the average age of all players in the NBA who are at least 60 years old", "sql": "SELECT AVG(AGE) AS average_age FROM nba_roster WHERE AGE > 5*12;"}
+{"question": "What is the average age of the youngest players in the NBA", "sql": "SELECT AVG(AGE) AS average_age FROM nba_roster WHERE AGE <= 25;"}
+{"question": "What are the top 3 teams with the highest average salary", "sql": "SELECT team, AVG(CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER)) AS average_salary FROM nba_roster GROUP BY team ORDER BY average_salary DESC LIMIT 3;"}
+{"question": "What is the most popular jersey number in the NBA", "sql": "SELECT Jersey, COUNT(*) as frequency FROM nba_roster WHERE Jersey!= 'NA' GROUP BY Jersey ORDER BY frequency DESC LIMIT 1;"}
+{"question": "What is the total salary of all players in the NBA, excluding those with unknown salaries", "sql": "SELECT SUM(CAST(SUBSTR(SALARY, 1, INSTR(SALARY, '$')-1) AS INTEGER)) as total_salary FROM nba_roster WHERE SALARY!= '--';"}
+{"question": "What is the number of players in the NBA roster who are 10 years or less away from the oldest player in the league", "sql": "SELECT COUNT(*) AS num_players FROM nba_roster WHERE AGE + 10 <= (SELECT MAX(AGE) FROM nba_roster);"}
+{"question": "Which three teams have the tallest average height in the NBA", "sql": "SELECT team, AVG(CAST(SUBSTR(HT, 1, INSTR(HT,' ')-1) AS INTEGER)+ CAST(SUBSTR(HT, INSTR(HT,' ')+1) AS FLOAT)/12) as height FROM nba_roster GROUP BY team ORDER BY height DESC LIMIT 3;"}
+{"question": "How many players in the NBA are older than 5 years old", "sql": "SELECT COUNT(*) AS num_players FROM nba_roster WHERE AGE > 5;"}
+{"question": "What are the 5 teams with the most players from the University of Michigan", "sql": "SELECT team, COUNT(*) AS num_players FROM nba_roster WHERE COLLEGE = 'Michigan' GROUP BY team ORDER BY num_players DESC LIMIT 5;"}
+{"question": "What is the number of players in the NBA who are 15 years or younger than the oldest player in the league", "sql": "SELECT COUNT(*) as num_players FROM nba_roster WHERE AGE + 15 <= (SELECT MAX(AGE) FROM nba_roster);"}
+{"question": "What are the minimum and maximum salaries of NBA players", "sql": "SELECT MIN(SALARY) AS min_salary, MAX(SALARY) AS max_salary FROM nba_roster WHERE SALARY!= '--';"}
+{"question": "What is the total salary of all players on the Toronto Raptors who are at least 6 feet 7 inches tall", "sql": "SELECT SUM(CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER)) as total_salary FROM nba_roster WHERE team='Toronto Raptors' AND CAST(SUBSTR(HT, 1, INSTR(HT,' ')-1) AS INTEGER)+ CAST(SUBSTR(HT, INSTR(HT,' ')+1) AS FLOAT)/12 >= 6.67;"}
+{"question": "What is the height with the most players in the NBA", "sql": "SELECT HT, COUNT(*) as count, AVG(WT) as avg_weight FROM nba_roster GROUP BY HT ORDER BY count DESC LIMIT 1;"}
+{"question": "What is the most common height of NBA players", "sql": "SELECT CAST(SUBSTR(HT, 1, INSTR(HT,'')-1) AS INTEGER) AS height, COUNT(*) AS count FROM nba_roster GROUP BY CAST(SUBSTR(HT, 1, INSTR(HT,'')-1) AS INTEGER) ORDER BY count DESC LIMIT 1;"}
+{"question": "What is the total salary of all NBA players with known salaries", "sql": "SELECT SUM(CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER)) as total_salary FROM nba_roster WHERE SALARY!= '--';"}
+{"question": "What is the oldest player in the NBA", "sql": "SELECT AVG(AGE) as average_age, NAME from nba_roster GROUP BY NAME ORDER BY average_age DESC LIMIT 1;"}
+{"question": "What is the average height of NBA players aged 25 or older", "sql": "SELECT AVG(CAST(SUBSTR(HT, 1, INSTR(HT,'')) as INTEGER)) AS avg_height FROM nba_roster WHERE AGE >= 25;"}
+{"question": "How many players in the NBA are 6'6", "sql": "SELECT COUNT(*) as num_players FROM nba_roster WHERE CAST(SUBSTR(HT, 1, INSTR(HT,'')-1) AS INTEGER) = '6' || '6';"}
+{"question": "Who are the oldest players on each team in the NBA, excluding the average age of their team", "sql": "SELECT nba_roster.NAME FROM nba_roster WHERE AGE > (SELECT AVG(AGE) FROM nba_roster WHERE TEAM = nba_roster.TEAM) ORDER BY AGE DESC;"}
+{"question": "What is the most common position played by Jalen Johnson", "sql": "SELECT POS, COUNT(*) AS count, POS FROM nba_roster GROUP BY POS ORDER BY count DESC LIMIT 1;"}
+{"question": "What is the number of players on each team who are 25 years old or older", "sql": "SELECT team, COUNT(*) AS num_players FROM nba_roster WHERE AGE >= 25 GROUP BY team;"}
+{"question": "What are the top 5 players in the NBA in terms of average height", "sql": "SELECT name, AVG(CAST(SUBSTR(HT, 1, INSTR(HT,' ')-1) AS INTEGER)) as avg_height FROM nba_roster GROUP BY name ORDER BY avg_height DESC LIMIT 5;"}
+{"question": "What players in the NBA are taller than the average height of all players", "sql": "SELECT NAME FROM nba_roster WHERE HT > (SELECT AVG(CAST(SUBSTR(HT, 1, INSTR(HT,' ')-1) AS INTEGER)+ CAST(SUBSTR(HT, INSTR(HT,' ')+1) AS FLOAT)/12) FROM nba_roster);"}
+{"question": "Which team has the most players 25 years old or older", "sql": "SELECT team, COUNT(*) as num_players FROM nba_roster WHERE AGE >= 25 GROUP BY team ORDER BY num_players DESC LIMIT 1;"}
+{"question": "What are the 5 most common jersey numbers in the NBA", "sql": "SELECT COUNT(DISTINCT Jersey), Jersey FROM nba_roster GROUP BY Jersey ORDER BY COUNT(DISTINCT Jersey) DESC LIMIT 5;"}
+{"question": "What colleges are most represented in the NBA", "sql": "SELECT COLLEGE, COUNT(*) AS num_players FROM nba_roster WHERE COLLEGE!= '--' GROUP BY COLLEGE;"}
+{"question": "What is the average salary of NBA players under the age of 25", "sql": "SELECT AVG(CAST(SUBSTR(SALARY, 1, INSTR(SALARY, '$')-1) AS INTEGER)) AS average_salary FROM nba_roster WHERE AGE <= 25;"}
+{"question": "What are the top 10 teams in the NBA by average salary", "sql": "SELECT Team, AVG(CAST(SUBSTRING(SALARY, 2, LENGTH(SALARY)-2) AS INTEGER)) AS average_salary FROM nba_roster WHERE SALARY!= '--' GROUP BY Team ORDER BY average_salary DESC LIMIT 10;"}
+{"question": "What is the player with the highest salary in the NBA", "sql": "SELECT name, CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER) AS salary FROM nba_roster WHERE SALARY!= '--' ORDER BY salary DESC LIMIT 1;"}
+{"question": "Who is the oldest player in the NBA who is not a rookie", "sql": "SELECT name, age FROM nba_roster WHERE SALARY!= '--' ORDER BY age DESC LIMIT 1;"}
+{"question": "How many players in the NBA are 6 feet 8 inches or taller", "sql": "SELECT COUNT(*) AS num_players FROM nba_roster WHERE CAST(SUBSTRING(HT, 0, INSTR(HT,'')-1) AS INTEGER) >= 68;"}
+{"question": "How many players in the NBA are 25 years old", "sql": "SELECT COUNT(*) FROM nba_roster WHERE age = 25;"}
+{"question": "What is the team with the oldest average age in the NBA", "sql": "SELECT AVG(AGE) AS average_age FROM nba_roster GROUP BY TEAM ORDER BY average_age DESC LIMIT 1;"}
+{"question": "Who is the highest-paid player in the NBA, excluding those with unknown salaries", "sql": "SELECT MAX(SALARY) AS highest_salary, NAME FROM nba_roster WHERE SALARY!= '--' GROUP BY NAME ORDER BY highest_salary DESC LIMIT 1;"}
+{"question": "Which team has the most players under the age of 25", "sql": "SELECT Team, COUNT(*) as num_players FROM nba_roster WHERE AGE < 25 GROUP BY Team ORDER BY num_players DESC LIMIT 1;"}
+{"question": "What are the top 3 jersey numbers with the most players in the NBA", "sql": "SELECT jersey, COUNT(*) as count FROM nba_roster WHERE jersey!= 'NA' GROUP BY jersey ORDER BY count DESC LIMIT 3;"}
+{"question": "What percentage of NBA players are at least 6 feet 8 inches tall", "sql": "SELECT COUNT(*) FROM nba_roster WHERE CAST(SUBSTR(HT, 1, INSTR(HT,' ')-1) AS INTEGER) >= 68 AND CAST(SUBSTR(HT, INSTR(HT,' ')+1) AS FLOAT)/12 >= 6.5;"}
+{"question": "What is the average age and height of NBA players, excluding those with unknown heights", "sql": "SELECT AVG(AGE) as average_age, AVG(CAST(SUBSTR(HT, 1, INSTR(HT,'')) as INTEGER)) as average_height FROM nba_roster WHERE HT!= 'NA';"}
+{"question": "What is the player who has played for the most teams in the NBA", "sql": "SELECT name, COUNT(*) as num_teams FROM nba_roster GROUP BY name ORDER BY num_teams DESC LIMIT 1;"}
+{"question": "What is the average height of players in the NBA who are 25 years or older", "sql": "SELECT AVG(CAST(SUBSTR(HT, 1, INSTR(HT,'')) as INTEGER)) AS avg_height FROM nba_roster WHERE CAST(AGE as INTEGER) >= 25;"}
+{"question": "Which team has the most unique players in the NBA", "sql": "SELECT COUNT(DISTINCT TEAM), TEAM FROM nba_roster GROUP BY TEAM ORDER BY COUNT(DISTINCT TEAM) DESC LIMIT 1;"}
+{"question": "What are the 5 oldest players in the NBA", "sql": "SELECT NAME, AGE FROM nba_roster ORDER BY AGE DESC LIMIT 5;"}
+{"question": "What is the shortest height of a player in the NBA", "sql": "SELECT name, HT FROM nba_roster ORDER BY LENGTH(HT) LIMIT 1, 1;"}
+{"question": "What is the average height of Power Forwards and Centers in the NBA", "sql": "SELECT AVG(CAST(SUBSTR(HT, 1, INSTR(HT,'')-1) AS INTEGER) + CAST(SUBSTR(HT, INSTR(HT,'')+1) AS FLOAT)/12) AS average_height FROM nba_roster WHERE POS IN ('PF', 'C');"}
+{"question": "What is the total salary of the team with the highest payroll in the NBA", "sql": "SELECT team, SUM(CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER)) as total_salary FROM nba_roster WHERE SALARY!= '--' GROUP BY team ORDER BY total_salary DESC;"}
+{"question": "What are the top-paid players for each team in the NBA", "sql": "SELECT team, name, CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER) as salary FROM nba_roster WHERE SALARY!= '--' GROUP BY team ORDER BY salary DESC;"}
+{"question": "What is the average salary of all NBA players, excluding those with unknown salaries", "sql": "SELECT AVG(CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER)) as average_salary FROM nba_roster WHERE SALARY!= '--';"}
+{"question": "What is the name and jersey number of the player with the highest jersey number in the NBA roster", "sql": "SELECT NAME, JERSEY FROM nba_roster ORDER BY CAST(JERSEY AS INTEGER) DESC LIMIT 1;"}
+{"question": "Which five jersey numbers are the most commonly worn by players in the NBA", "sql": "SELECT name, jersey, COUNT(*) as count FROM nba_roster GROUP BY jersey ORDER BY count DESC LIMIT 5;"}
+{"question": "What is the most popular position in the NBA", "sql": "SELECT POS, COUNT(*) AS count FROM nba_roster WHERE POS IN ('PG', 'SG', 'SF', 'PF', 'C') GROUP BY POS ORDER BY count DESC LIMIT 1;"}
+{"question": "What is the number of players in the NBA who are 68 inches tall", "sql": "SELECT COUNT(*) FROM nba_roster WHERE CAST(SUBSTRING(HT, 0, INSTR(HT,'')) AS INTEGER) = 68;"}
+{"question": "Who is the highest paid player on the team with the most players", "sql": "SELECT NAME FROM nba_roster WHERE SALARY = (SELECT MAX(SALARY) FROM nba_roster) AND TEAM = (SELECT TEAM FROM nba_roster GROUP BY TEAM ORDER BY COUNT(*) DESC LIMIT 1);"}
+{"question": "What is the average height of players on the Toronto Raptors", "sql": "SELECT Team, AVG(CAST(SUBSTR(HT, 1, INSTR(HT,'')-1) AS INTEGER)) as Average_Height FROM nba_roster GROUP BY Team;"}
+{"question": "What are the top 5 teams in the NBA by average salary", "sql": "SELECT Team, AVG(CAST(SUBSTR(SALARY, 1, INSTR(SALARY, '$')-1) AS INTEGER)) as Average_Salary FROM nba_roster WHERE SALARY!= '--' GROUP BY Team ORDER BY Average_Salary DESC;"}
+{"question": "What are the top 3 highest-paid players in the NBA", "sql": "SELECT NAME, SALARY FROM nba_roster WHERE SALARY!= '--' ORDER BY CAST(SUBSTR(SALARY, 1, INSTR(SALARY, '$')-1) AS INTEGER) DESC LIMIT 3;"}
+{"question": "How many players in the NBA are 25 years old or younger", "sql": "SELECT COUNT(*) FROM nba_roster WHERE AGE <= 25;"}
+{"question": "How many players in the NBA attended Michigan State University", "sql": "SELECT COUNT(*) as num_players FROM nba_roster WHERE COLLEGE = 'Michigan State';"}
+{"question": "What is the most represented position among University of Michigan alumni in the NBA", "sql": "SELECT POS, COUNT(*) as count FROM nba_roster WHERE COLLEGE='Michigan' GROUP BY POS ORDER BY count DESC LIMIT 1;"}
+{"question": "How many players are on each team in the NBA", "sql": "SELECT TEAM, COUNT(*) AS num_players FROM nba_roster GROUP BY TEAM;"}
+{"question": "What is the number of players in the NBA roster who are more than 5 years older than the average age of all players in the roster", "sql": "SELECT COUNT(*) as num_players FROM nba_roster WHERE AGE - (SELECT AVG(AGE) FROM nba_roster) > 5;"}
+{"question": "What teams have multiple players from the same college", "sql": "SELECT team, COUNT(*) AS num_players, COLLEGE FROM nba_roster GROUP BY team, COLLEGE HAVING COUNT(*) > 1;"}
+{"question": "Which college has the most players on the Brooklyn Nets", "sql": "SELECT team, COUNT(*) AS num_players, COLLEGE FROM nba_roster WHERE COLLEGE!= '--' GROUP BY team, COLLEGE ORDER BY num_players DESC;"}
+{"question": "What is the average age of NBA players who are older than 5 years old", "sql": "SELECT AVG(AGE) AS average_age FROM nba_roster WHERE AGE > 5;"}
+{"question": "What are the 10 players with the tallest and shortest heights in the NBA", "sql": "SELECT name, HT, MAX(CAST(SUBSTRING(HT, 1, INSTR(HT,'')-1) AS INTEGER)) AS max_height, MIN(CAST(SUBSTRING(HT, INSTR(HT,'')+1) AS INTEGER)) AS min_height FROM nba_roster WHERE HT!= 'NA' GROUP BY name ORDER BY max_height DESC, min_height ASC LIMIT 10;"}
+{"question": "What is the age of the oldest player on the Toronto Raptors", "sql": "SELECT name, age FROM nba_roster WHERE team='Toronto Raptors' ORDER BY age DESC LIMIT 1;"}
+{"question": "What are the top 5 highest-paid college-educated players in the NBA", "sql": "SELECT NAME, SALARY FROM nba_roster WHERE COLLEGE!= '--' AND SALARY!= '--' ORDER BY CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER) DESC LIMIT 5;"}
+{"question": "What is the number of players in the NBA roster who do not have a college listed", "sql": "SELECT COUNT(*) AS num_players FROM nba_roster WHERE COLLEGE = '--';"}
+{"question": "What is the number of players on the Toronto Raptors who earn more than $10,000,000", "sql": "SELECT COUNT(*) FROM nba_roster WHERE team='Toronto Raptors' AND CAST(SUBSTRING(SALARY, 2) AS INTEGER) > 10000000;"}
+{"question": "What is the average height and age of NBA players, and how do these values vary by height", "sql": "SELECT AVG(CAST(SUBSTR(HT, 1, INSTR(HT,'')-1) AS INTEGER)) AS average_height, AVG(AGE) AS average_age FROM nba_roster GROUP BY CAST(SUBSTR(HT, INSTR(HT,'')+1) AS INTEGER);"}
+{"question": "What is the most frequently worn jersey number in the NBA", "sql": "SELECT COUNT(DISTINCT Jersey) AS total_jerseys, Jersey FROM nba_roster GROUP BY Jersey ORDER BY total_jerseys DESC LIMIT 1;"}
+{"question": "What is the average age of all players in the NBA who have a known salary", "sql": "SELECT AVG(AGE) FROM nba_roster WHERE SALARY!= '--';"}
+{"question": "What is the 99th percentile salary in the NBA?", "sql": "SELECT (CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER)) as percentile FROM nba_roster WHERE SALARY!= '--' order by percentile limit 1 offset (select count(*) from nba_roster where SALARY != '--')*99/100-1;"}
+{"question": "What is the 75th percentile salary in the NBA?", "sql": "SELECT (CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER)) as percentile FROM nba_roster WHERE SALARY!= '--' order by percentile limit 1 offset (select count(*) from nba_roster where SALARY != '--')*75/100-1;"}
+{"question": "What is the 25th percentile salary in the NBA?", "sql": "SELECT (CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER)) as percentile FROM nba_roster WHERE SALARY!= '--' order by percentile limit 1 offset (select count(*) from nba_roster where SALARY != '--')*25/100-1;"}
+{"question": "What is the median weight in the NBA?", "sql": "select CAST(SUBSTR(WT, 1, INSTR(WT,' ')) as INTEGER) as percentile from nba_roster order by percentile limit 1 offset (select count(*) from nba_roster)/2;"}
+{"question": "What is the average weight in the NBA?", "sql": "SELECT AVG(CAST(SUBSTR(WT, 1, INSTR(WT,' ')) as INTEGER)) FROM nba_roster;"}
+{"question": "What is the median height in the NBA?", "sql": "select CAST(SUBSTR(HT, 1, INSTR(HT,' ')-1) AS INTEGER)+ CAST(SUBSTR(HT, INSTR(HT,' ')+1) AS FLOAT)/12 as percentile from nba_roster order by percentile limit 1 offset (select count(*) from nba_roster)/2;"}
+{"question": "Can you tell me how many players are in the NBA?", "sql": "select count(*) from nba_roster;"}
+{"question": "Would you please let me know what the highest paid players are for each position?", "sql": "SELECT name, pos, MAX(CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER)) as max_salary FROM nba_roster WHERE SALARY!= '--' GROUP BY POS;"}
+{"question": "Is Jalen Johnson 23 years old?", "sql": "Select name, age from nba_roster where name='Jalen Johnson';"}
+{"question": "Who is the oldest player on the Brooklyn Nets?", "sql": "SELECT NAME FROM nba_roster WHERE TEAM = 'Brooklyn Nets' AND AGE = (SELECT MAX(AGE) FROM nba_roster WHERE TEAM = 'Brooklyn Nets');"}
+{"question": "Who has the highest salary on the Memphis Grizzlies?", "sql": "select salary, name from nba_roster where team='Memphis Grizzlies' and SALARY!= '--' ORDER BY CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER) DESC LIMIT 1;"}
+{"question": "Which player has the highest salary on the Cleveland Cavaliers?", "sql": "select salary, name from nba_roster where team='Cleveland Cavaliers' and SALARY!= '--' ORDER BY CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER) DESC LIMIT 1;"}
+{"question": "Who is the highest paid center on the Dallas Mavericks?", "sql": "select salary, name from nba_roster where team='Dallas Mavericks' and POS='C' and SALARY!= '--' ORDER BY CAST(REPLACE(REPLACE(SALARY, '$', ''), ',','') AS INTEGER) DESC LIMIT 1;"}
+{"question": "How much is Marcus Smart getting paid?", "sql": "select salary from nba_roster where name='Marcus Smart';"}
+{"question": "What's the average age of the Trail Blazers?", "sql": "select avg(age) from nba_roster where team='Portland Trail Blazers';"}
+{"question": "What's the median age of the NBA?", "sql": "select CAST(AGE as INTEGER) as percentile from nba_roster order by percentile limit 1 offset (select count(*) from nba_roster)/2;"}
+{"question": "What's the median age of the Miami Heat?", "sql": "select CAST(AGE as INTEGER) as percentile from nba_roster where team='Miami Heat' order by percentile limit 1 offset (select count(*) from nba_roster where team='Miami Heat')/2;"}

Разница между файлами не показана из-за своего большого размера
+ 226 - 0
recipes/3p_integrations/lamini/text2sql_memory_tuning/data/training_data/generated_queries_v2.jsonl


Разница между файлами не показана из-за своего большого размера
+ 1254 - 0
recipes/3p_integrations/lamini/text2sql_memory_tuning/data/training_data/generated_queries_v2_large.jsonl


+ 0 - 0
recipes/3p_integrations/lamini/text2sql_memory_tuning/data/training_data/generated_queries_v2_large_filtered.jsonl


Некоторые файлы не были показаны из-за большого количества измененных файлов