
initial commit

Radu Boncea 8 months ago
commit e935566895
59 changed files with 78104 additions and 0 deletions
  1. .gitignore (+228 -0)
  2. LICENSE.md (+21 -0)
  3. README.md (+104 -0)
  4. llmeval/commons/__init__.py (+0 -0)
  5. llmeval/commons/admin.py (+121 -0)
  6. llmeval/commons/apps.py (+210 -0)
  7. llmeval/commons/management/__init__.py (+0 -0)
  8. llmeval/commons/management/commands/__init__.py (+0 -0)
  9. llmeval/commons/management/commands/clean_pubmedqa_dataset.py (+47 -0)
  10. llmeval/commons/management/commands/clean_qa.py (+11 -0)
  11. llmeval/commons/management/commands/eval_qa.py (+314 -0)
  12. llmeval/commons/management/commands/export_results.py (+63 -0)
  13. llmeval/commons/management/commands/import_medqa.py (+54 -0)
  14. llmeval/commons/management/commands/import_mmlu.py (+58 -0)
  15. llmeval/commons/management/commands/import_pubmedqa.py (+69 -0)
  16. llmeval/commons/migrations/0001_initial.py (+278 -0)
  17. llmeval/commons/migrations/0002_alter_llmbackend_parameteres_and_more.py (+19 -0)
  18. llmeval/commons/migrations/0003_alter_llmbackend_parameteres.py (+18 -0)
  19. llmeval/commons/migrations/0004_alter_llmbackend_parameteres.py (+18 -0)
  20. llmeval/commons/migrations/0005_alter_llmmodel_parameters.py (+18 -0)
  21. llmeval/commons/migrations/0006_alter_evalconfig_description.py (+18 -0)
  22. llmeval/commons/migrations/0007_evalsession_name.py (+19 -0)
  23. llmeval/commons/migrations/0008_evalconfig_dataset.py (+24 -0)
  24. llmeval/commons/migrations/0009_rename_parameteres_llmbackend_parameters.py (+16 -0)
  25. llmeval/commons/migrations/0010_llmbackend_client_type.py (+18 -0)
  26. llmeval/commons/migrations/0011_evalsession_parameters.py (+18 -0)
  27. llmeval/commons/migrations/0012_answerinterpreter_evalsession_answer_interpreter.py (+54 -0)
  28. llmeval/commons/migrations/0013_qa_xid.py (+18 -0)
  29. llmeval/commons/migrations/0014_alter_qa_options.py (+18 -0)
  30. llmeval/commons/migrations/0015_evalconfig_final_answer_pattern.py (+18 -0)
  31. llmeval/commons/migrations/0016_qa_context.py (+18 -0)
  32. llmeval/commons/migrations/0017_evalsession_request_delay_and_more.py (+24 -0)
  33. llmeval/commons/migrations/0018_qa_category_alter_llmbackend_client_type.py (+23 -0)
  34. llmeval/commons/migrations/0019_alter_qa_category.py (+18 -0)
  35. llmeval/commons/migrations/0020_evalsession_is_active.py (+18 -0)
  36. llmeval/commons/migrations/0021_alter_evalsession_is_active.py (+18 -0)
  37. llmeval/commons/migrations/0022_evalsession_dataset_target.py (+18 -0)
  38. llmeval/commons/migrations/0023_qa_extra_info_alter_evalsession_dataset_target_and_more.py (+28 -0)
  39. llmeval/commons/migrations/__init__.py (+0 -0)
  40. llmeval/commons/models.py (+233 -0)
  41. llmeval/commons/tests.py (+3 -0)
  42. llmeval/commons/urls.py (+2 -0)
  43. llmeval/commons/views.py (+68 -0)
  44. llmeval/datasets/medqa/4_options/phrases_no_exclude_dev.jsonl (+1272 -0)
  45. llmeval/datasets/medqa/4_options/phrases_no_exclude_test.jsonl (+1273 -0)
  46. llmeval/datasets/medqa/4_options/phrases_no_exclude_train.jsonl (+10178 -0)
  47. llmeval/datasets/medqa/US_qbank.jsonl (+14369 -0)
  48. llmeval/datasets/medqa/dev.jsonl (+1272 -0)
  49. llmeval/datasets/medqa/test.jsonl (+1273 -0)
  50. llmeval/datasets/medqa/train.jsonl (+10178 -0)
  51. llmeval/datasets/pubmedqa/ori_pqal.json (+35173 -0)
  52. llmeval/datasets/pubmedqa/test_ground_truth.json (+502 -0)
  53. llmeval/llmeval/__init__.py (+0 -0)
  54. llmeval/llmeval/asgi.py (+16 -0)
  55. llmeval/llmeval/settings.py (+125 -0)
  56. llmeval/llmeval/urls.py (+22 -0)
  57. llmeval/llmeval/wsgi.py (+16 -0)
  58. llmeval/manage.py (+22 -0)
  59. llmeval/requirements.txt (+70 -0)

+ 228 - 0
.gitignore

@@ -0,0 +1,228 @@
+# Created by https://www.toptal.com/developers/gitignore/api/python,macos,visualstudiocode
+# Edit at https://www.toptal.com/developers/gitignore?templates=python,macos,visualstudiocode
+
+### macOS ###
+# General
+.DS_Store
+.AppleDouble
+.LSOverride
+
+# Icon must end with two \r
+Icon
+
+
+# Thumbnails
+._*
+
+# Files that might appear in the root of a volume
+.DocumentRevisions-V100
+.fseventsd
+.Spotlight-V100
+.TemporaryItems
+.Trashes
+.VolumeIcon.icns
+.com.apple.timemachine.donotpresent
+
+# Directories potentially created on remote AFP share
+.AppleDB
+.AppleDesktop
+Network Trash Folder
+Temporary Items
+.apdisk
+
+### macOS Patch ###
+# iCloud generated files
+*.icloud
+
+### Python ###
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
+
+### Python Patch ###
+# Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration
+poetry.toml
+
+# ruff
+.ruff_cache/
+
+# LSP config files
+pyrightconfig.json
+
+### VisualStudioCode ###
+.vscode/*
+!.vscode/settings.json
+!.vscode/tasks.json
+!.vscode/launch.json
+!.vscode/extensions.json
+!.vscode/*.code-snippets
+
+# Local History for Visual Studio Code
+.history/
+
+# Built Visual Studio Code Extensions
+*.vsix
+
+### VisualStudioCode Patch ###
+# Ignore all local history of files
+.history
+.ionide
+
+# End of https://www.toptal.com/developers/gitignore/api/python,macos,visualstudiocode

+ 21 - 0
LICENSE.md

@@ -0,0 +1,21 @@
+MIT License
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+&copy;2024 Radu Boncea, ICI Bucharest

+ 104 - 0
README.md

@@ -0,0 +1,104 @@
+# Overview
+
+This repository hosts a Django application called **LLM Eval**, designed for evaluating and benchmarking large language models against specific datasets. The app integrates clients from leading platforms, including OpenAI, Anthropic, Google, Ollama, and Anyscale.
+
+# Setup
+
+Follow these steps to set up the application on your local machine.
+
+## Clone the repository
+
+Clone the repository to your local machine using the following command:
+
+```bash
+git clone <repository-url>
+```
+
+## Create and activate a Python virtual environment
+Create a virtual environment and activate it:
+
+```bash 
+python -m venv /path/to/environment
+source /path/to/environment/bin/activate  # On Windows, use /path/to/environment/Scripts/activate
+```
+
+## Install Python dependencies
+Install the required Python dependencies:
+
+```bash
+pip install -r requirements.txt
+```
+
+## Set up the database connection
+The default database is SQLite, but you can configure other database engines in `llmeval/llmeval/settings.py`. For more information on database connections, see the [Django documentation](https://docs.djangoproject.com/en/5.0/ref/databases/).
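+
+A minimal sketch, assuming PostgreSQL — the engine path is Django's built-in backend, while the name, user, and password below are placeholders, not values shipped with this project:
+
+```python
+# llmeval/llmeval/settings.py (excerpt) — hypothetical PostgreSQL configuration
+DATABASES = {
+    "default": {
+        "ENGINE": "django.db.backends.postgresql",
+        "NAME": "llmeval",        # placeholder database name
+        "USER": "llmeval",        # placeholder user
+        "PASSWORD": "change-me",  # keep real secrets out of version control
+        "HOST": "localhost",
+        "PORT": "5432",
+    }
+}
+```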
+
+## Run the application as a standalone server
+
+1. Initialize the database
+
+Run the following command to apply database migrations:
+```bash
+python manage.py migrate
+```
+2. Create a superuser to access the app
+
+Create a superuser account to manage the application:
+```bash
+python manage.py createsuperuser
+```
+3. Start the server
+
+Start the Django development server:
+```bash
+python manage.py runserver 8000
+```
+The application will be accessible at http://localhost:8000/admin. Log in using the superuser credentials. Once logged in, you will see the following menu items:
+
+1.  **LLM Backends**. These interfaces connect to various LLM model providers. The initial implementation includes support for Ollama, OpenAI, Google, Anthropic, and Anyscale. The parameters attribute contains JSON-serialized arguments passed to the model client. For example, OpenAI requires an `api_key` in the parameters to access its web services.
+2.  **LLM Models**. These models are associated with a backend that serves them. The parameters attribute includes arguments passed to the chat client, such as `top_p`, `top_k`, and `temperature`.
+3.  **Eval Configs** are configurations used to evaluate the performance of LLM models against a dataset. In this section, you will specify the dataset, system prompt, and the regular expression for matching the final answer. Additionally, you can include a chat history, such as few-shot examples, to enhance evaluation accuracy.
+4.  **Answer Interpreters** can be used to interpret a model's answers with another LLM-based assistant, e.g. to extract the final answer when the model does not follow the requested format.
+5.  **Eval Sessions** are the evaluation runs executed by the application. Here you choose the Eval Config, the LLM model, the Answer Interpreter (if needed), the dataset target (e.g. train, test, dev, validation), and the delay between requests. The `parameters` attribute overrides the LLM model's parameters for the session; example `parameters` payloads are shown below.
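+
+As an illustration, a backend's `parameters` field holds the JSON-serialized client arguments, and a model's `parameters` field holds chat arguments. The values below are placeholders modelled on the seeded defaults, not working credentials:
+
+```json
+{
+  "api_key": "esecret_12345abcde",
+  "base_url": "https://api.endpoints.anyscale.com/v1"
+}
+```
+
+and, for an LLM model:
+
+```json
+{
+  "temperature": 0.0,
+  "top_p": 1.0
+}
+```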
+
+# Running an evaluation
+## Load dataset
+Loaders are currently implemented for the MedQA, PubMedQA, and MMLU datasets.
+```bash
+python manage.py import_medqa --file=datasets/medqa/test.jsonl --target=test --dataset=medqa  # loads the MedQA test questions into the "test" target
+```
+```bash
+python manage.py import_mmlu --dataset=mmlu --target=test --subject=anatomy  # loads the MMLU anatomy subject from Hugging Face into a dataset named "mmlu", target "test"
+```
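+
+PubMedQA can be loaded the same way; the paths below point at the dataset files bundled in this repository:
+```bash
+python manage.py import_pubmedqa --file=datasets/pubmedqa/ori_pqal.json --file-test-ground-truth=datasets/pubmedqa/test_ground_truth.json --dataset=pubmedqa --target=test
+```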
+
+## Execute a session
+```bash
+python manage.py eval_qa --session-id=16 --continue
+```
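+
+The dataset target (e.g. `test`) comes from the session itself. `--continue` skips questions already answered in the session; the optional `--sample-size=<n>` flag limits the run to *n* questions, and `--randomize` shuffles the question order.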
+
+## Data Loaders
+The data loaders live in `commons/management/commands`. There are currently three: `import_medqa`, `import_mmlu`, and `import_pubmedqa`. MedQA and PubMedQA are imported from local files, while the MMLU importer pulls the dataset from Hugging Face. A new loader follows the same pattern; a minimal sketch is given below.
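+
+A custom loader is just another management command that creates `QA` rows. The sketch below is illustrative and not part of this commit; it assumes a hypothetical JSONL file with `question`, `options`, `answer`, and `answer_idx` keys (the same shape as MedQA) — adapt the field mapping to your format:
+
+```python
+# commons/management/commands/import_custom.py — illustrative sketch, not part of this commit
+import json
+from django.core.management.base import BaseCommand, CommandParser
+from commons.models import QA, Dataset
+
+class Command(BaseCommand):
+    help = 'Import a custom QA dataset from a JSONL file'
+
+    def add_arguments(self, parser: CommandParser) -> None:
+        parser.add_argument('--file', type=str, help='Path to JSONL file')
+        parser.add_argument('--dataset', type=str, help='Dataset name')
+        parser.add_argument('--target', type=str, default='test', help='train, test or dev')
+
+    def handle(self, *args, **options):
+        dataset, _ = Dataset.objects.get_or_create(name=options['dataset'])
+        with open(options['file'], 'r') as f:
+            for line in f:
+                data = json.loads(line)
+                qa = QA(
+                    dataset=dataset,
+                    question=data['question'],
+                    options=data['options'],  # e.g. {"A": "...", "B": "..."}
+                    correct_answer=data['answer'],
+                    correct_answer_idx=data['answer_idx'],
+                    target=options['target'],
+                )
+                try:
+                    qa.save()
+                except Exception as e:  # e.g. duplicate hash
+                    self.stderr.write(f'Skipping row: {e}')
+        self.stdout.write(self.style.SUCCESS('Successfully imported custom data'))
+```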
+
+
+# License
+
+MIT License
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+&copy;2024 Radu Boncea, ICI Bucharest

+ 0 - 0
llmeval/commons/__init__.py


+ 121 - 0
llmeval/commons/admin.py

@@ -0,0 +1,121 @@
+from django.contrib import admin
+from django.utils.safestring import mark_safe
+from django.urls import path
+from .views import eval_download_results, config_download_results
+from .models import (
+    QA, Dataset, EvalAnswer,
+    LLMBackend, LLMModel,
+    EvalConfig, RoleMessage, EvalSession,
+    AnswerInterpreter
+    )
+
+
+class QAAdmin(admin.ModelAdmin):
+    list_display = ('question', 'xid', 'dataset', 'category', 'correct_answer', 'target')
+    list_filter = ('target', 'dataset', 'category')
+    search_fields = ('question', 'correct_answer', 'category', 'extra_info')
+
+    fieldsets = (
+        (None, {'fields': ('dataset', 'question', 'category', 'extra_info', 'context', 'options', 'correct_answer', 'correct_answer_idx', 'target')}),
+    )
+
+    readonly_fields = ('hash',)
+admin.site.register(QA, QAAdmin)
+
+
+class DatasetAdmin(admin.ModelAdmin):
+    list_display = ('name', 'description')
+    search_fields = ('name', 'description')
+    ordering = ('-created_at',)
+
+    fieldsets = (
+        (None, {'fields': ('name', 'description')}),
+    )
+admin.site.register(Dataset, DatasetAdmin)
+
+
+class EvalAnswerAdmin(admin.ModelAdmin):
+    list_display = ('question', 'get_question_id', 'is_correct', 'llm_model', 'eval_session')
+    list_filter = ('is_correct', 'llm_backend', 'llm_model', 'question__dataset', 'eval_session', 'eval_session__config')
+    search_fields = ('question', 'instruction', 'assistant_answer')
+    ordering = ('-created_at', 'question')
+
+    def get_question_id(self, obj):
+        return obj.question.id
+    get_question_id.short_description = 'Question ID'
+    get_question_id.admin_order_field = 'question__id'
+admin.site.register(EvalAnswer, EvalAnswerAdmin)
+
+
+class LLMBackendAdmin(admin.ModelAdmin):
+    list_display = ('name',)
+    search_fields = ('name',)
+admin.site.register(LLMBackend, LLMBackendAdmin)
+
+
+class LLMModelAdmin(admin.ModelAdmin):
+    list_display = ('name', 'backend')
+    list_filter = ('backend',)
+    search_fields = ('name',)
+admin.site.register(LLMModel, LLMModelAdmin)
+
+
+class RoleMessageAdmin(admin.ModelAdmin):
+    list_display = ('role', 'eval_config')
+    list_filter = ('role', 'eval_config')
+    search_fields = ('role', 'content')
+admin.site.register(RoleMessage, RoleMessageAdmin)
+
+
+class RoleMessageInline(admin.TabularInline):
+    model = RoleMessage
+    extra = 3
+
+class EvalConfigAdmin(admin.ModelAdmin):
+    list_display = ('name', 'dataset', 'created_at', 'link')
+    search_fields = ('name', 'description')
+    ordering = ('-created_at',)
+    inlines = [RoleMessageInline]
+
+    @admin.display(description='Link')
+    def link(self, obj):
+        return mark_safe(f'<a href="/admin/commons/evalconfig/{obj.id}/config_download_results/">Download Results</a>')
+    
+    def get_urls(self):
+        urls = super().get_urls()
+        my_urls = [path('<int:config_id>/config_download_results/', config_download_results, name='config_download_results'),]
+        return my_urls + urls
+    
+admin.site.register(EvalConfig, EvalConfigAdmin)
+
+class EvalSessionAdmin(admin.ModelAdmin):
+    list_display = ('id', 'name', 'is_active', 'config', 'llm_model', 'progress', 'accuracy', 'link')
+    list_filter = ('is_active', 'config', 'llm_model')
+    list_display_links = ["name"]
+    search_fields = ('name', 'config', 'llm_model')
+    ordering = ('-created_at',)
+
+    @admin.display(description='Link')
+    def link(self, obj):
+        return mark_safe(f'<a href="/admin/commons/evalsession/{obj.id}/eval_download_results/">Download Results</a>')
+    
+    def get_urls(self):
+        urls = super().get_urls()
+        my_urls = [path('<int:session_id>/eval_download_results/', eval_download_results, name='eval_download_results'),]
+        return my_urls + urls
+    
+    def accuracy(self, obj):
+        return "{:.2%}".format(obj.accuracy)
+    
+    def progress(self, obj):
+        total_counts_answered = obj.evalanswer_set.count()
+        total_counts = QA.objects.filter(dataset=obj.config.dataset).filter(target=obj.dataset_target).count()
+        return "{}/{}".format(total_counts_answered, total_counts)
+
+admin.site.register(EvalSession, EvalSessionAdmin)
+
+class AnswerInterpreterAdmin(admin.ModelAdmin):
+    list_display = ('name', 'llm_model')
+    search_fields = ('name', 'llm_model__name')
+    ordering = ('-created_at',)
+admin.site.register(AnswerInterpreter, AnswerInterpreterAdmin)

+ 210 - 0
llmeval/commons/apps.py

@@ -0,0 +1,210 @@
+import os
+from django.apps import AppConfig
+from django.db import transaction
+from django.db.models.signals import post_migrate
+
+
+def import_llm_backends(sender, **kwargs):
+    backend_model = sender.get_model('LLMBackend')
+
+    if backend_model.objects.count() != 0:
+        return
+
+    backend_model.objects.create(
+        name='OpenAI',
+        parameters={
+            'api_key': 'sk-1234567890abcdef1234567890abcdef',
+        },
+        client_type='openai'
+    )
+    backend_model.objects.create(
+        name='Ollama',
+        parameters={
+            'host': 'http://localhost:11434',
+        },
+        client_type='ollama'
+    )
+    backend_model.objects.create(
+        name='Anyscale',
+        parameters={
+            "api_key": "esecret_12345abcde", 
+            "base_url": "https://api.endpoints.anyscale.com/v1"
+        },
+        client_type='openai'
+    )
+    backend_model.objects.create(
+        name='Anthropic',
+        parameters={
+            "api_key": "sk-12345abcde"
+        },
+        client_type='anthropic'
+    )
+    backend_model.objects.create(
+        name='GoogleGenAI',
+        parameters={
+            "api_key": "sk-12345abcde"
+        },
+        client_type='genai'
+    )
+    backend_model.objects.create(
+        name='LMStudio',
+        parameters={
+            "api_key": "lm-studio",
+            "base_url": "http://localhost:1234/v1"
+
+        },
+        client_type='openai'
+    )
+
+def import_llm_models(sender, **kwargs):
+    backend_model = sender.get_model('LLMBackend')
+    llm_model = sender.get_model('LLMModel')
+
+    if llm_model.objects.count() != 0:
+        return
+
+    openai_backend = backend_model.objects.get(name='OpenAI')
+    llm_model.objects.create(
+        name='gpt-4o',
+        backend=openai_backend,
+    )
+    llm_model.objects.get_or_create(
+        name='gpt-4o-mini',
+        backend=openai_backend,
+    )
+    
+    ollama_backend = backend_model.objects.get(name='Ollama')
+    llm_model.objects.get_or_create(
+        name='llama3',
+        backend=ollama_backend,
+    )
+
+    anthropic_backend = backend_model.objects.get(name='Anthropic')
+    llm_model.objects.get_or_create(
+        name='claude-3-5-sonnet-20240620',
+        backend=anthropic_backend,
+    )
+
+    anyscale_backend = backend_model.objects.get(name='Anyscale')
+    llm_model.objects.get_or_create(
+        name='meta-llama/Meta-Llama-3-70B-Instruct',
+        backend=anyscale_backend,
+    )
+
+def import_datasets(sender, **kwargs):
+    dataset_model = sender.get_model('Dataset')
+
+    if dataset_model.objects.count() != 0:
+        return
+
+    dataset_model.objects.create(
+        name='medqa',
+        description='MedQA test dataset - 1273 questions',
+    )
+    
+    os.system('python manage.py import_medqa --file datasets/medqa/test.jsonl --dataset medqa --target test')
+
+    dataset_model.objects.create(
+        name='pubmedqa',
+        description='PubMedQA test[expert] dataset - 1000 questions',
+    )
+    os.system('python manage.py import_pubmedqa --file datasets/pubmedqa/ori_pqal.json --file-test-ground-truth=datasets/pubmedqa/test_ground_truth.json --dataset pubmedqa --target test')
+
+    # dataset_model.objects.create(
+    #     name='mmlu',
+    #     description='MMLU test dataset - all 14012 questions',
+    # )
+    # os.system('python manage.py import_mmlu --subject all --dataset mmlu --target test')
+
+
+def import_configs(sender, **kwargs):
+    config_model = sender.get_model('EvalConfig')
+    dataset = sender.get_model('Dataset')
+
+    # get the medqa dataset
+    medqa_dataset = dataset.objects.get(name='medqa')
+
+    if config_model.objects.count() != 0:
+        return
+    
+    with transaction.atomic():
+        config_model.objects.create(
+            name='MedQA - zero shot CoT',
+            description='MedQA test dataset evaluation',
+            dataset=medqa_dataset,
+            sys_prompt="""
+You are a helpful assistant designed to answer questions. 
+Provide concise and accurate responses, ensuring you follow the provided instructions thoroughly.""".strip(),
+            final_answer_pattern=r'(?i)Final Answer\s*:\s*([A-E])'
+        )
+
+        # add role messages
+        role_message_model = sender.get_model('RoleMessage')
+        config = config_model.objects.get(name='MedQA - zero shot CoT')
+        role_message_model.objects.create(
+            role='user',
+            eval_config=config,
+            content = """
+Answer the following multiple choice question. The last line of your response should be of the following format: 'Final Answer: $LETTER' (without quotes)
+where $LETTER is the letter of the correct answer from the given choices. Think step by step before answering.""".strip()
+        )
+
+def import_sessions(sender, **kwargs):
+    session_model = sender.get_model('EvalSession')
+    config = sender.get_model('EvalConfig')
+    llm_model = sender.get_model('LLMModel')
+
+    if session_model.objects.count() != 0:
+        return
+
+    medqa_config = config.objects.get(name='MedQA - zero shot CoT')
+    gpt4o = llm_model.objects.get(name='gpt-4o')
+    session_model.objects.create(
+        name='MedQA - gpt-4o',
+        config=medqa_config,
+        llm_model=gpt4o,
+        dataset_target='test'
+    )
+
+def answer_interpreters(sender, **kwargs):
+    answer_interpreter_model = sender.get_model('AnswerInterpreter')
+
+    if answer_interpreter_model.objects.count() != 0:
+        return
+    
+    llm_model = sender.get_model('LLMModel')
+    gpt4o_mini = llm_model.objects.get(name='gpt-4o-mini')
+
+    answer_interpreter_model.objects.create(
+        name='YesNoMaybe Interpreter',
+        prompt="""
+You are a helpful assistant who will respond concisely to user requests. Your task is to examine the answer given by another virtual assistant and classify it as 'yes', 'no' or 'maybe'. Your final line of response should be 'Final Answer: yes | no | maybe' (without quotes).
+Here is the assistant's answer:
+"$QUESTION"
+The final answer:""",
+        llm_model=gpt4o_mini,
+    )
+
+    answer_interpreter_model.objects.create(
+        name='MultiChoice Interpreter',
+        prompt="""
+You are a helpful assistant who will respond concisely to user requests. Your task is to examine the answer given by another virtual assistant and extract the letter corresponding to the selected option from multiple choices. Your final line of response should be "Final Answer: $LETTER" where $LETTER is the letter of the answer chosen by the assistant.
+Here is the assistant's answer:
+"$QUESTION"
+The final answer:""",
+        llm_model=gpt4o_mini,
+    )
+
+class CommonsConfig(AppConfig):
+    default_auto_field = 'django.db.models.BigAutoField'
+    name = 'commons'
+
+
+    def ready(self):
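+        # seed default backends, models, datasets, configs, sessions and interpreters right after migrations run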
+        post_migrate.connect(import_llm_backends, sender=self)
+        post_migrate.connect(import_llm_models, sender=self)
+        post_migrate.connect(import_datasets, sender=self)
+        post_migrate.connect(import_configs, sender=self)
+        post_migrate.connect(import_sessions, sender=self)
+        post_migrate.connect(answer_interpreters, sender=self)
+    

+ 0 - 0
llmeval/commons/management/__init__.py


+ 0 - 0
llmeval/commons/management/commands/__init__.py


+ 47 - 0
llmeval/commons/management/commands/clean_pubmedqa_dataset.py

@@ -0,0 +1,47 @@
+import json
+import os
+from tqdm import tqdm
+from django.core.management.base import BaseCommand, CommandError, CommandParser
+from commons.models import QA, Dataset
+
+class Command(BaseCommand):
+    help = 'Clean PubMedQA dataset'
+
+    def handle(self, *args, **options):
+        dataset = Dataset.objects.filter(name='pubmedqa').first()
+        if not dataset:
+            raise CommandError('PubMedQA dataset not found')
+        
+        file_path = 'datasets/pubmedqa/test_ground_truth.json'
+        if not os.path.isfile(file_path):
+            raise CommandError('Invalid file path')
+        
+        test_xids = []
+        with open(file_path, 'r') as f:
+            data = json.load(f)
+            for xid in tqdm(data.keys(), desc="Collecting test question IDs"):
+                test_xids.append(xid)
+
+        # delete all questions from the dataset whose xid is NOT in test_xids
+        QA.objects.filter(dataset=dataset).exclude(xid__in=test_xids).delete()
+
+        file_path = 'datasets/pubmedqa/ori_pqal.json'
+        with open(file_path, 'r') as f:
+            data = json.load(f)
+            for xid in test_xids:
+                entry = data[xid]
+                # get qa with xid
+                qa = QA.objects.filter(dataset=dataset, xid=xid).first()
+                if not qa:
+                    continue
+                extra_info = {
+                    'reasoning_required_pred': entry['reasoning_required_pred'],
+                    'reasoning_free_pred': entry['reasoning_free_pred']
+                }
+                qa.extra_info = extra_info
+                qa.save()
+
+
+        self.stdout.write(self.style.SUCCESS('Successfully cleaned PubMedQA dataset'))

+ 11 - 0
llmeval/commons/management/commands/clean_qa.py

@@ -0,0 +1,11 @@
+import json
+from tqdm import tqdm
+from django.core.management.base import BaseCommand, CommandError, CommandParser
+from commons.models import QA, Dataset
+
+class Command(BaseCommand):
+    help = 'Delete all QA data'
+
+    def handle(self, *args, **options):
+        QA.objects.all().delete()
+        self.stdout.write(self.style.SUCCESS('Successfully deleted all QA data'))

+ 314 - 0
llmeval/commons/management/commands/eval_qa.py

@@ -0,0 +1,314 @@
+import json
+import re
+import os
+import sys
+import time
+import ollama
+import dotenv
+from openai import OpenAI
+import google.generativeai as genai
+from google.generativeai.types import HarmCategory, HarmBlockThreshold
+import anthropic
+from tqdm import tqdm
+from abc import ABC, abstractmethod
+from django.core.management.base import (
+    BaseCommand, 
+    CommandError, 
+    CommandParser
+)
+from commons.models import (
+    QA, Dataset, EvalAnswer,
+    EvalConfig, RoleMessage, EvalSession
+)
+
+
+dotenv.load_dotenv()
+
+
+class LLMClient(ABC):
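+    # Base class for provider clients: builds the chat message list from the session's config and scores one QA item in llm_eval().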
+    def __init__(self, model, session=None, **kwargs):
+        self.messages = []
+        self.model = model
+        self.session = session
+        self.final_answer_pattern = None
+        self.stats = {
+            "instruction": "",
+            "answer": ""
+        }
+
+        if session and isinstance(session, EvalSession):
+            self.final_answer_pattern = re.compile(session.config.final_answer_pattern)
+            self.messages.append({
+                "role": "system",
+                "content": self.session.config.sys_prompt
+            })
+
+            for role_message in RoleMessage.objects.filter(eval_config=session.config):
+                self.messages.append({
+                    "role": role_message.role,
+                    "content": role_message.content
+                })
+
+    def make_messages(self, question, options, context):
+        messages = self.messages.copy()
+        option_str = self.option_str(options)
+
+        content = f"Question: {question}"
+        if option_str:
+            content = f"{content}\nChoices:\n {option_str}"
+        if context:
+            content = f"{content}\nContext:\n{context}"
+
+        messages.append(
+            {"role": "user", 
+             "content": content
+             })
+        return messages
+    
+    def get_chat_params(self):
+        model_parameters = self.session.llm_model.parameters
+        if self.session.parameters:
+            model_parameters.update(self.session.parameters)
+        return model_parameters
+
+    @abstractmethod
+    def send_question(self, question, options, context):
+        pass
+        
+    def llm_eval(self, q):
+        question = q.question
+        options = q.options
+        context = q.context
+        correct_answer_idx = q.correct_answer_idx
+        result = self.send_question(question, options, context)
+        self.stats['answer'] = result
+        match = re.search(self.final_answer_pattern, result)
+        if not match:
+            if self.session.answer_interpreter:
+                # pass the session too: get_client() takes (llm_model, session)
+                interpreter_client = get_client(self.session.answer_interpreter.llm_model, self.session)
+                question = self.session.answer_interpreter.prompt.replace("$QUESTION", self.stats['answer'])
+                result = interpreter_client.send_question(question)
+                self.stats['answer'] += f"\nInterpreter: {result}"
+                match = re.search(self.final_answer_pattern, result)
+        if match:
+            final_answer = match.group(1)
+            if final_answer.upper() == correct_answer_idx.upper():
+                return True
+        return False
+
+    def messages_2_instruction(self, messages):
+        instruction = ""
+        for message in messages:
+            instruction += f"{message['role'].upper()}: {message['content']}\n\n"
+        return instruction
+    
+    def option_str(self, options=[]):
+        options_str = ""
+        if not options:
+            return None
+        for i, option in options.items():
+            options_str += f"{i}) {option}\n"
+        return options_str
+
+class GoogleGenAI(LLMClient):
+    def __init__(self, model, session, **kwargs):
+        super().__init__(model, session, **kwargs)
+        if 'api_key' not in kwargs:
+            raise CommandError('Google Gen AI API key not found')
+        genai.configure(api_key=kwargs['api_key'])
+        self.client = genai.GenerativeModel(
+            model_name = model, 
+            system_instruction=self.session.config.sys_prompt
+            )
+
+    def send_question(self, question, options=[], context=None):
+        messages = self.make_messages(question, options, context)
+        self.stats['instruction'] = self.messages_2_instruction(messages)
+
+        prompt = messages[-1]['content']
+        messages = messages[:-1]
+
+        history = []
+        for message in messages:
+            if message['role'] == 'system':
+                continue
+            if message['role'] == 'assistant':
+                history.append({
+                    "role": "model",
+                    "parts": message['content']
+                })
+                continue
+            history.append({
+                "role": message['role'],
+                "parts": message['content']
+            })
+        chat = self.client.start_chat(history=history)
+        response = chat.send_message(prompt, safety_settings={
+            HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE,
+            HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE,
+            HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE,
+            HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE,
+        })
+
+        return response.text
+    
+class OpenAIClient(LLMClient):
+    def __init__(self, model, session, **kwargs):
+        super().__init__(model, session, **kwargs)
+        self.client = OpenAI(**kwargs)
+
+    def send_question(self, question, options=[], context=None):
+        messages = self.make_messages(question, options, context)
+        self.stats['instruction'] = self.messages_2_instruction(messages)
+
+        model_parameters = self.get_chat_params()
+
+        completion = self.client.chat.completions.create(
+            model=self.model,
+            messages=messages,
+            **model_parameters
+        )
+        msg_content = completion.choices[0].message.content
+        return msg_content
+    
+class AnthropicClient(LLMClient):
+    def __init__(self, model, session, **kwargs):
+        super().__init__(model, session, **kwargs)
+        if 'api_key' not in kwargs:
+            raise CommandError('Anthropic API key not found')
+        self.client = anthropic.Anthropic(api_key=kwargs['api_key'])
+
+    def send_question(self, question, options=[], context=None):
+        messages = self.make_messages(question, options, context)
+        self.stats['instruction'] = self.messages_2_instruction(messages)
+
+        sys_prompt = messages[0]['content']
+        messages = messages[1:]
+
+        # iterate the messages and concatenate the successive messages that have the same role
+        new_messages = []
+        for message in messages:
+            if new_messages and new_messages[-1]['role'] == message['role']:
+                new_messages[-1]['content'] += "\n" + message['content']
+            else:
+                new_messages.append(message)
+
+        model_parameters = self.get_chat_params()
+        
+        response = self.client.messages.create(
+            model=self.model,
+            system=sys_prompt,
+            messages=new_messages,
+            **model_parameters
+        )
+        return response.content[0].text
+
+class OllamaClient(LLMClient):
+    def __init__(self, model, session, **kwargs):
+        super().__init__(model, session, **kwargs)
+        self.client = ollama.Client(**kwargs)
+
+    def send_question(self, question, options=None, context=None):
+        messages = self.make_messages(question, options, context)
+        self.stats['instruction'] = self.messages_2_instruction(messages)
+
+        model_parameters = self.get_chat_params()
+
+        response = self.client.chat(model=self.model, messages=messages, **model_parameters)
+
+        self.stats['answer'] = response['message']['content']
+        return response['message']['content']
+
+def get_client(llm_model, session):
+    llm_backend = llm_model.backend
+    if llm_backend.client_type == 'openai':
+        if not llm_backend.parameters:
+            raise CommandError('OpenAI parameters not found')
+        return OpenAIClient(llm_model.name, session, **llm_backend.parameters)
+    elif llm_backend.client_type == 'ollama':
+        if not llm_backend.parameters:
+            raise CommandError('Ollama parameters not found')
+        return OllamaClient(llm_model.name, session, **llm_backend.parameters)
+    elif llm_backend.client_type == 'genai':
+        if not llm_backend.parameters:
+            raise CommandError('Google GenAI parameters not found')
+        return GoogleGenAI(llm_model.name, session, **llm_backend.parameters)
+    elif llm_backend.client_type == 'anthropic':
+        if not llm_backend.parameters:
+            raise CommandError('Anthropic parameters not found')
+        return AnthropicClient(llm_model.name, session, **llm_backend.parameters)
+    raise CommandError(f'Unknown client type: {llm_backend.client_type}')
+
+class Command(BaseCommand):
+    help = 'Evaluate a QA dataset against an LLM for a given session'
+
+    def add_arguments(self, parser: CommandParser) -> None:
+        parser.add_argument('--sample-size', type=int, help='Sample size')
+        parser.add_argument('--randomize', action='store_true', help='Randomize questions')
+        parser.add_argument('--session-id', type=int, help='Session ID')
+        parser.add_argument('--continue', action='store_true', help='Continue from last question')
+
+    def handle(self, *args, **options):
+        sample_size = options['sample_size']
+        randomize = options['randomize']
+        session_id = options['session_id']
+        continue_from_last = options['continue']
+        
+        session = EvalSession.objects.filter(id=session_id).first()
+        if not session:
+            raise CommandError('Session not found')
+        
+        dataset = session.config.dataset
+        target = session.dataset_target
+        llm_model = session.llm_model
+        llm_backend = llm_model.backend
+        
+
+        qa_set = QA.objects.filter(dataset=dataset).order_by('id')
+        
+        if target:
+            qa_set = qa_set.filter(target=target)
+
+        if randomize:
+            qa_set = qa_set.order_by('?')
+
+        if continue_from_last:
+            # exclude questions already answered in this session; this must happen
+            # before slicing, since a sliced queryset cannot be filtered further
+            answered_ids = EvalAnswer.objects.filter(eval_session=session).values_list('question_id', flat=True)
+            qa_set = qa_set.exclude(id__in=answered_ids)
+
+        if sample_size:
+            qa_set = qa_set[:sample_size]
+
+        if not qa_set.exists():
+            raise CommandError('No questions found')
+
+        print(f"Questions to evaluate: {qa_set.count()}")
+
+        client = get_client(llm_model, session)
+        
+        stats = {
+            "correct": 0,
+            "total": qa_set.count()
+        }
+        for q in tqdm(qa_set, desc="Evaluating dataset"):
+            correct = client.llm_eval(q)
+            eval_answer = EvalAnswer(
+                eval_session=session,
+                question=q,
+                is_correct=correct,
+                instruction=client.stats['instruction'],
+                assistant_answer = client.stats['answer'],
+                hash=q.hash,
+                llm_backend=llm_backend,
+                llm_model=llm_model
+            )
+            if correct:
+                stats["correct"] += 1
+            
+            eval_answer.save()
+            if session.request_delay:
+                time.sleep(session.request_delay)
+
+        print(f"Accuracy: {stats['correct']}/{stats['total']} ({stats['correct']/stats['total']:.2f})")
+
+        self.stdout.write(self.style.SUCCESS('Successfully completed the evaluation session'))

+ 63 - 0
llmeval/commons/management/commands/export_results.py

@@ -0,0 +1,63 @@
+import json
+import os
+import csv
+from tqdm import tqdm
+from django.core.management.base import BaseCommand, CommandError, CommandParser
+from commons.models import QA, Dataset, EvalSession, EvalConfig, EvalAnswer
+
+class Command(BaseCommand):
+    help = 'Export benchmark results to CSV file'
+
+    def add_arguments(self, parser: CommandParser) -> None:
+        parser.add_argument('--config', type=str, help='EvalConfig ID')
+        parser.add_argument('--exclude-sessions', type=str, help='Session IDs to exclude')
+        parser.add_argument('--file', type=str, help='Path to CSV file')
+
+    def handle(self, *args, **options):
+        config_id = options['config']
+        if not config_id:
+            raise CommandError('EvalConfig ID is required')
+
+        exclude_session_ids = options['exclude_sessions']
+        
+        file_path = options['file']
+        if not file_path:
+            raise CommandError('File path is required')
+
+        # check if EvalConfig exists
+        config = EvalConfig.objects.filter(id=config_id).first()
+        if not config:
+            raise CommandError('EvalConfig not found')
+
+        # get all sessions for this config
+        sessions = EvalSession.objects.filter(config=config).filter(is_active=True)
+        if exclude_session_ids:
+            exclude_session_ids = exclude_session_ids.split(',')
+            sessions = sessions.exclude(id__in=exclude_session_ids)
+
+        # get all questions for the dataset in this config
+        questions = dict()
+        for qa in QA.objects.filter(dataset=config.dataset):
+            questions[qa.id] = qa.question
+
+        models_data = {}
+        for session in sessions:
+            eval_answers = EvalAnswer.objects.filter(eval_session=session).order_by('question__id')
+            data = {}
+            for eval_answer in eval_answers:
+                data[eval_answer.question.id] = 1 if eval_answer.is_correct else 0
+            models_data[session.llm_model.name] = data
+        
+        print("Exporting the following data:")
+        for model_name, data in models_data.items():
+            print(f"{model_name}: {len(data)}")
+
+        with open(file_path, 'w') as f:
+            writer = csv.writer(f)
+            writer.writerow(['Question ID', 'Question Text'] + list(models_data.keys()))
+            # iterate over the question IDs answered by the first model
+            for question_id in tqdm(next(iter(models_data.values())).keys(), desc="Exporting data"):
+                row = [question_id, questions[question_id]] + [data.get(question_id, '') for data in models_data.values()]
+                writer.writerow(row)
+            row = ['Accuracy'] + [''] + [f"{sum(data.values()) / len(data) * 100:.2f}%" for data in models_data.values()]
+            writer.writerow(row)
+        self.stdout.write(self.style.SUCCESS('Successfully exported results'))

+ 54 - 0
llmeval/commons/management/commands/import_medqa.py

@@ -0,0 +1,54 @@
+import json
+import os
+from tqdm import tqdm
+from django.core.management.base import BaseCommand, CommandError, CommandParser
+from commons.models import QA, Dataset
+
+class Command(BaseCommand):
+    help = 'Import MedQA data from JSON file'
+
+    def add_arguments(self, parser: CommandParser) -> None:
+        parser.add_argument('--file', type=str, help='Path to JSON file')
+        parser.add_argument('--target', type=str, help='Dataset target (train, test, dev)', default='test')
+        parser.add_argument('--dataset', type=str, help='Dataset name')
+
+    def handle(self, *args, **options):
+        dataset_name = options['dataset']
+        if not dataset_name:
+            raise CommandError('Dataset name is required')
+
+        file_path = options['file']
+        if not file_path or not os.path.isfile(file_path):
+            raise CommandError('Invalid file path')
+
+        target = options['target']
+        if target not in ['train', 'test', 'dev']:
+            raise CommandError('Invalid target. Must be one of train, test, dev')
+        
+
+        # check if dataset exists
+        dataset, _ = Dataset.objects.get_or_create(name=dataset_name)
+        
+        with open(file_path, 'r') as f:
+            lines = f.readlines()
+            for line in tqdm(lines, desc="Importing MedQA data"):
+                data = json.loads(line)
+                question = data['question']
+                options = data['options']
+                correct_answer = data['answer']
+                correct_answer_idx = data['answer_idx']
+                qa = QA(
+                    dataset=dataset,
+                    question=question, 
+                    options=options, 
+                    correct_answer=correct_answer, 
+                    correct_answer_idx=correct_answer_idx,
+                    target=target
+                    )
+                try:
+                    qa.save()
+                except Exception as e:
+                    self.stdout.write(self.style.ERROR(f'Error importing MedQA data: {e}'))
+                    continue
+                
+        self.stdout.write(self.style.SUCCESS('Successfully imported MedQA data'))

+ 58 - 0
llmeval/commons/management/commands/import_mmlu.py

@@ -0,0 +1,58 @@
+import json
+import os
+from tqdm import tqdm
+from datasets import load_dataset
+from django.core.management.base import BaseCommand, CommandError, CommandParser
+from commons.models import QA, Dataset
+
+class Command(BaseCommand):
+    help = 'Import MMLU dataset from Huggingface'
+
+    def add_arguments(self, parser: CommandParser) -> None:
+        parser.add_argument('--subject', type=str, help='Subject or category of the dataset. Default is "all"', default='all')
+        parser.add_argument('--target', type=str, help='Dataset target (train, test, dev)', default='test')
+        parser.add_argument('--dataset', type=str, help='Dataset name')
+
+    def handle(self, *args, **options):
+        dataset_name = options['dataset']
+        if not dataset_name:
+            raise CommandError('Dataset name is required')
+
+        subject = options['subject']
+        if not subject:
+            raise CommandError('Subject is required')
+
+        target = options['target']
+        if target not in ['train', 'test', 'dev', 'validation']:
+            raise CommandError('Invalid target. Must be one of train, test, dev, validation')
+        
+        dataset, _ = Dataset.objects.get_or_create(name=dataset_name)
+
+        ds = load_dataset("cais/mmlu", subject)[target]
+
+        for entry in tqdm(ds, desc="Importing MMLU data"):
+            question = entry['question']
+            category = entry['subject']
+            choices = {}
+            for i, choice in enumerate(entry['choices']):
+                choices[chr(65 + i)] = choice
+            correct_answer_idx = chr(65 + entry['answer'])
+            correct_answer = choices[correct_answer_idx]
+            qa = QA(
+                dataset=dataset,
+                question=question,
+                category=category,
+                options=choices, 
+                correct_answer=correct_answer, 
+                correct_answer_idx=correct_answer_idx,
+                target=target
+                )
+            try:
+                qa.save()
+            except Exception as e:
+                self.stdout.write(self.style.ERROR(f'Error importing MMLU question "{question}": {e}'))
+                continue
+
+        self.stdout.write(self.style.SUCCESS('Successfully imported MMLU data'))

+ 69 - 0
llmeval/commons/management/commands/import_pubmedqa.py

@@ -0,0 +1,69 @@
+import json
+import os
+from tqdm import tqdm
+from django.core.management.base import BaseCommand, CommandError, CommandParser
+from commons.models import QA, Dataset
+
+class Command(BaseCommand):
+    help = 'Import PubMedQA data from JSON file'
+
+    def add_arguments(self, parser: CommandParser) -> None:
+        parser.add_argument('--file', type=str, help='Path to JSON file')
+        parser.add_argument('--file-test-ground-truth', type=str, help='Path to test ground truth JSON file', default=None)
+        parser.add_argument('--target', type=str, help='Dataset target (train, test, dev)', default='test')
+        parser.add_argument('--dataset', type=str, help='Dataset name')
+
+    def handle(self, *args, **options):
+        dataset_name = options['dataset']
+        if not dataset_name:
+            raise CommandError('Dataset name is required')
+
+        file_path = options['file']
+        if not file_path or not os.path.isfile(file_path):
+            raise CommandError('Invalid file path')
+        
+        test_xids = []
+        if options['file_test_ground_truth']:
+            with open(options['file_test_ground_truth'], 'r') as f:
+                data = json.load(f)
+                for xid, _ in data.items():
+                    test_xids.append(xid)
+
+        target = options['target']
+        if target not in ['train', 'test', 'dev']:
+            raise CommandError('Invalid target. Must be one of train, test, dev')
+        
+
+        dataset, _ = Dataset.objects.get_or_create(name=dataset_name)
+
+        with open(file_path, 'r') as f:
+            data = json.load(f)
+            
+            for xid, entry in tqdm(data.items(), desc="Importing PubMedQA data"):
+                if test_xids and xid not in test_xids:
+                    continue
+                question = entry['QUESTION']
+                correct_answer = entry['LONG_ANSWER']
+                correct_answer_idx = entry['final_decision']
+                context = "\n".join(entry['CONTEXTS'])
+                extra_info = {
+                    'reasoning_required_pred': entry['reasoning_required_pred'],
+                    'reasoning_free_pred': entry['reasoning_free_pred']
+                }
+                qa = QA(
+                    dataset=dataset,
+                    xid=xid,
+                    question=question, 
+                    context=context,
+                    correct_answer=correct_answer, 
+                    correct_answer_idx=correct_answer_idx,
+                    target=target,
+                    extra_info=extra_info
+                    )
+                try:
+                    qa.save()
+                except Exception as e:
+                    self.stdout.write(self.style.ERROR(f'Error importing PubMedQA data: {e}'))
+                    continue
+
+        self.stdout.write(self.style.SUCCESS('Successfully imported PubMedQA data'))

+ 278 - 0
llmeval/commons/migrations/0001_initial.py

@@ -0,0 +1,278 @@
+# Generated by Django 5.0.6 on 2024-07-09 06:02
+
+import django.db.models.deletion
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    initial = True
+
+    dependencies = []
+
+    operations = [
+        migrations.CreateModel(
+            name="Dataset",
+            fields=[
+                (
+                    "id",
+                    models.BigAutoField(
+                        auto_created=True,
+                        primary_key=True,
+                        serialize=False,
+                        verbose_name="ID",
+                    ),
+                ),
+                ("name", models.CharField(max_length=100)),
+                ("description", models.TextField()),
+                ("created_at", models.DateTimeField(auto_now_add=True)),
+                ("updated_at", models.DateTimeField(auto_now=True)),
+            ],
+            options={
+                "verbose_name": "Dataset",
+                "verbose_name_plural": "Datasets",
+                "db_table": "dataset",
+            },
+        ),
+        migrations.CreateModel(
+            name="EvalConfig",
+            fields=[
+                (
+                    "id",
+                    models.BigAutoField(
+                        auto_created=True,
+                        primary_key=True,
+                        serialize=False,
+                        verbose_name="ID",
+                    ),
+                ),
+                ("name", models.CharField(max_length=100)),
+                ("description", models.TextField()),
+                (
+                    "sys_prompt",
+                    models.TextField(
+                        default="You are a helpful assistant designed to answer questions.\nProvide concise and accurate responses, ensuring you follow the provided instructions thoroughly."
+                    ),
+                ),
+                ("created_at", models.DateTimeField(auto_now_add=True)),
+                ("updated_at", models.DateTimeField(auto_now=True)),
+            ],
+            options={
+                "verbose_name": "EvalConfig",
+                "verbose_name_plural": "EvalConfig",
+                "db_table": "eval_config",
+            },
+        ),
+        migrations.CreateModel(
+            name="LLMBackend",
+            fields=[
+                (
+                    "id",
+                    models.BigAutoField(
+                        auto_created=True,
+                        primary_key=True,
+                        serialize=False,
+                        verbose_name="ID",
+                    ),
+                ),
+                ("name", models.CharField(max_length=100)),
+                ("parameteres", models.TextField()),
+            ],
+            options={
+                "verbose_name": "LLM Backend",
+                "verbose_name_plural": "LLM Backends",
+                "db_table": "llm_backed",
+            },
+        ),
+        migrations.CreateModel(
+            name="LLMModel",
+            fields=[
+                (
+                    "id",
+                    models.BigAutoField(
+                        auto_created=True,
+                        primary_key=True,
+                        serialize=False,
+                        verbose_name="ID",
+                    ),
+                ),
+                ("name", models.CharField(max_length=100)),
+                ("parameters", models.TextField()),
+                ("created_at", models.DateTimeField(auto_now_add=True)),
+                ("updated_at", models.DateTimeField(auto_now=True)),
+                (
+                    "backend",
+                    models.ForeignKey(
+                        on_delete=django.db.models.deletion.CASCADE,
+                        to="commons.llmbackend",
+                    ),
+                ),
+            ],
+            options={
+                "verbose_name": "LLM Model",
+                "verbose_name_plural": "LLM Models",
+                "db_table": "llm_model",
+            },
+        ),
+        migrations.CreateModel(
+            name="EvalSession",
+            fields=[
+                (
+                    "id",
+                    models.BigAutoField(
+                        auto_created=True,
+                        primary_key=True,
+                        serialize=False,
+                        verbose_name="ID",
+                    ),
+                ),
+                ("created_at", models.DateTimeField(auto_now_add=True)),
+                ("updated_at", models.DateTimeField(auto_now=True)),
+                (
+                    "config",
+                    models.ForeignKey(
+                        on_delete=django.db.models.deletion.CASCADE,
+                        to="commons.evalconfig",
+                    ),
+                ),
+                (
+                    "llm_model",
+                    models.ForeignKey(
+                        on_delete=django.db.models.deletion.CASCADE,
+                        to="commons.llmmodel",
+                    ),
+                ),
+            ],
+            options={
+                "verbose_name": "EvalSession",
+                "verbose_name_plural": "EvalSession",
+                "db_table": "eval_session",
+            },
+        ),
+        migrations.CreateModel(
+            name="QA",
+            fields=[
+                (
+                    "id",
+                    models.BigAutoField(
+                        auto_created=True,
+                        primary_key=True,
+                        serialize=False,
+                        verbose_name="ID",
+                    ),
+                ),
+                ("question", models.TextField()),
+                ("correct_answer", models.TextField()),
+                (
+                    "target",
+                    models.CharField(
+                        choices=[("test", "test"), ("train", "train"), ("dev", "dev")],
+                        db_index=True,
+                        default="test",
+                        max_length=100,
+                    ),
+                ),
+                ("options", models.JSONField()),
+                ("correct_answer_idx", models.CharField(max_length=100)),
+                ("hash", models.CharField(max_length=100, unique=True)),
+                (
+                    "dataset",
+                    models.ForeignKey(
+                        on_delete=django.db.models.deletion.CASCADE,
+                        to="commons.dataset",
+                    ),
+                ),
+            ],
+            options={
+                "verbose_name": "QA",
+                "verbose_name_plural": "QAs",
+                "db_table": "qa",
+            },
+        ),
+        migrations.CreateModel(
+            name="EvalAnswer",
+            fields=[
+                (
+                    "id",
+                    models.BigAutoField(
+                        auto_created=True,
+                        primary_key=True,
+                        serialize=False,
+                        verbose_name="ID",
+                    ),
+                ),
+                ("instruction", models.TextField()),
+                ("assistant_answer", models.TextField()),
+                ("is_correct", models.BooleanField()),
+                ("created_at", models.DateTimeField(auto_now_add=True)),
+                ("hash", models.CharField(max_length=100)),
+                (
+                    "eval_session",
+                    models.ForeignKey(
+                        on_delete=django.db.models.deletion.CASCADE,
+                        to="commons.evalsession",
+                    ),
+                ),
+                (
+                    "llm_backend",
+                    models.ForeignKey(
+                        on_delete=django.db.models.deletion.CASCADE,
+                        to="commons.llmbackend",
+                    ),
+                ),
+                (
+                    "llm_model",
+                    models.ForeignKey(
+                        on_delete=django.db.models.deletion.CASCADE,
+                        to="commons.llmmodel",
+                    ),
+                ),
+                (
+                    "question",
+                    models.ForeignKey(
+                        on_delete=django.db.models.deletion.CASCADE, to="commons.qa"
+                    ),
+                ),
+            ],
+            options={
+                "verbose_name": "EvalAnswer",
+                "verbose_name_plural": "EvalAnswer",
+                "db_table": "eval_answer",
+            },
+        ),
+        migrations.CreateModel(
+            name="RoleMessage",
+            fields=[
+                (
+                    "id",
+                    models.BigAutoField(
+                        auto_created=True,
+                        primary_key=True,
+                        serialize=False,
+                        verbose_name="ID",
+                    ),
+                ),
+                (
+                    "role",
+                    models.CharField(
+                        choices=[("assistant", "assistant"), ("user", "user")],
+                        db_index=True,
+                        max_length=100,
+                    ),
+                ),
+                ("content", models.TextField()),
+                (
+                    "eval_config",
+                    models.ForeignKey(
+                        on_delete=django.db.models.deletion.CASCADE,
+                        to="commons.evalconfig",
+                    ),
+                ),
+            ],
+            options={
+                "verbose_name": "RoleMessage",
+                "verbose_name_plural": "RoleMessage",
+                "db_table": "role_message",
+            },
+        ),
+    ]

+ 19 - 0
llmeval/commons/migrations/0002_alter_llmbackend_parameteres_and_more.py

@@ -0,0 +1,19 @@
+# Generated by Django 5.0.6 on 2024-07-09 07:19
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ("commons", "0001_initial"),
+    ]
+
+    operations = [
+        migrations.AlterField(
+            model_name="llmbackend", name="parameteres", field=models.JSONField(),
+        ),
+        migrations.AlterField(
+            model_name="llmmodel", name="parameters", field=models.JSONField(),
+        ),
+    ]

+ 18 - 0
llmeval/commons/migrations/0003_alter_llmbackend_parameteres.py

@@ -0,0 +1,18 @@
+# Generated by Django 5.0.6 on 2024-07-09 07:28
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ("commons", "0002_alter_llmbackend_parameteres_and_more"),
+    ]
+
+    operations = [
+        migrations.AlterField(
+            model_name="llmbackend",
+            name="parameteres",
+            field=models.JSONField(null=True),
+        ),
+    ]

+ 18 - 0
llmeval/commons/migrations/0004_alter_llmbackend_parameteres.py

@@ -0,0 +1,18 @@
+# Generated by Django 5.0.6 on 2024-07-09 07:28
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ("commons", "0003_alter_llmbackend_parameteres"),
+    ]
+
+    operations = [
+        migrations.AlterField(
+            model_name="llmbackend",
+            name="parameteres",
+            field=models.JSONField(blank=True, null=True),
+        ),
+    ]

+ 18 - 0
llmeval/commons/migrations/0005_alter_llmmodel_parameters.py

@@ -0,0 +1,18 @@
+# Generated by Django 5.0.6 on 2024-07-09 07:53
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ("commons", "0004_alter_llmbackend_parameteres"),
+    ]
+
+    operations = [
+        migrations.AlterField(
+            model_name="llmmodel",
+            name="parameters",
+            field=models.JSONField(blank=True, null=True),
+        ),
+    ]

+ 18 - 0
llmeval/commons/migrations/0006_alter_evalconfig_description.py

@@ -0,0 +1,18 @@
+# Generated by Django 5.0.6 on 2024-07-09 08:06
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ("commons", "0005_alter_llmmodel_parameters"),
+    ]
+
+    operations = [
+        migrations.AlterField(
+            model_name="evalconfig",
+            name="description",
+            field=models.TextField(blank=True, null=True),
+        ),
+    ]

+ 19 - 0
llmeval/commons/migrations/0007_evalsession_name.py

@@ -0,0 +1,19 @@
+# Generated by Django 5.0.6 on 2024-07-09 09:04
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ("commons", "0006_alter_evalconfig_description"),
+    ]
+
+    operations = [
+        migrations.AddField(
+            model_name="evalsession",
+            name="name",
+            field=models.CharField(default="", max_length=100),
+            preserve_default=False,
+        ),
+    ]

+ 24 - 0
llmeval/commons/migrations/0008_evalconfig_dataset.py

@@ -0,0 +1,24 @@
+# Generated by Django 5.0.6 on 2024-07-09 09:21
+
+import django.db.models.deletion
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ("commons", "0007_evalsession_name"),
+    ]
+
+    operations = [
+        migrations.AddField(
+            model_name="evalconfig",
+            name="dataset",
+            field=models.ForeignKey(
+                default=1,
+                on_delete=django.db.models.deletion.CASCADE,
+                to="commons.dataset",
+            ),
+            preserve_default=False,
+        ),
+    ]

+ 16 - 0
llmeval/commons/migrations/0009_rename_parameteres_llmbackend_parameters.py

@@ -0,0 +1,16 @@
+# Generated by Django 5.0.6 on 2024-07-09 09:51
+
+from django.db import migrations
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ("commons", "0008_evalconfig_dataset"),
+    ]
+
+    operations = [
+        migrations.RenameField(
+            model_name="llmbackend", old_name="parameteres", new_name="parameters",
+        ),
+    ]

+ 18 - 0
llmeval/commons/migrations/0010_llmbackend_client_type.py

@@ -0,0 +1,18 @@
+# Generated by Django 5.0.6 on 2024-07-09 16:48
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('commons', '0009_rename_parameteres_llmbackend_parameters'),
+    ]
+
+    operations = [
+        migrations.AddField(
+            model_name='llmbackend',
+            name='client_type',
+            field=models.CharField(choices=[('openai', 'openai'), ('ollama', 'ollama')], db_index=True, default='openai', max_length=100),
+        ),
+    ]

+ 18 - 0
llmeval/commons/migrations/0011_evalsession_parameters.py

@@ -0,0 +1,18 @@
+# Generated by Django 5.0.6 on 2024-07-09 17:13
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('commons', '0010_llmbackend_client_type'),
+    ]
+
+    operations = [
+        migrations.AddField(
+            model_name='evalsession',
+            name='parameters',
+            field=models.JSONField(blank=True, null=True),
+        ),
+    ]

+ 54 - 0
llmeval/commons/migrations/0012_answerinterpreter_evalsession_answer_interpreter.py

@@ -0,0 +1,54 @@
+# Generated by Django 5.0.6 on 2024-07-11 06:35
+
+import django.db.models.deletion
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ("commons", "0011_evalsession_parameters"),
+    ]
+
+    operations = [
+        migrations.CreateModel(
+            name="AnswerInterpreter",
+            fields=[
+                (
+                    "id",
+                    models.BigAutoField(
+                        auto_created=True,
+                        primary_key=True,
+                        serialize=False,
+                        verbose_name="ID",
+                    ),
+                ),
+                ("name", models.CharField(max_length=100)),
+                ("prompt", models.TextField()),
+                ("created_at", models.DateTimeField(auto_now_add=True)),
+                ("updated_at", models.DateTimeField(auto_now=True)),
+                (
+                    "llm_model",
+                    models.ForeignKey(
+                        on_delete=django.db.models.deletion.CASCADE,
+                        to="commons.llmmodel",
+                    ),
+                ),
+            ],
+            options={
+                "verbose_name": "AnswerInterpreter",
+                "verbose_name_plural": "AnswerInterpreter",
+                "db_table": "answer_interpreter",
+            },
+        ),
+        migrations.AddField(
+            model_name="evalsession",
+            name="answer_interpreter",
+            field=models.ForeignKey(
+                blank=True,
+                null=True,
+                on_delete=django.db.models.deletion.SET_NULL,
+                to="commons.answerinterpreter",
+            ),
+        ),
+    ]

+ 18 - 0
llmeval/commons/migrations/0013_qa_xid.py

@@ -0,0 +1,18 @@
+# Generated by Django 5.0.6 on 2024-07-29 12:34
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('commons', '0012_answerinterpreter_evalsession_answer_interpreter'),
+    ]
+
+    operations = [
+        migrations.AddField(
+            model_name='qa',
+            name='xid',
+            field=models.CharField(blank=True, max_length=100, null=True),
+        ),
+    ]

+ 18 - 0
llmeval/commons/migrations/0014_alter_qa_options.py

@@ -0,0 +1,18 @@
+# Generated by Django 5.0.6 on 2024-07-29 12:47
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('commons', '0013_qa_xid'),
+    ]
+
+    operations = [
+        migrations.AlterField(
+            model_name='qa',
+            name='options',
+            field=models.JSONField(blank=True, null=True),
+        ),
+    ]

+ 18 - 0
llmeval/commons/migrations/0015_evalconfig_final_answer_pattern.py

@@ -0,0 +1,18 @@
+# Generated by Django 5.0.6 on 2024-07-29 15:12
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('commons', '0014_alter_qa_options'),
+    ]
+
+    operations = [
+        migrations.AddField(
+            model_name='evalconfig',
+            name='final_answer_pattern',
+            field=models.TextField(default='(?i)Final Answer\\s*:\\s*([A-E])'),
+        ),
+    ]

+ 18 - 0
llmeval/commons/migrations/0016_qa_context.py

@@ -0,0 +1,18 @@
+# Generated by Django 5.0.6 on 2024-07-29 15:21
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('commons', '0015_evalconfig_final_answer_pattern'),
+    ]
+
+    operations = [
+        migrations.AddField(
+            model_name='qa',
+            name='context',
+            field=models.TextField(blank=True, null=True),
+        ),
+    ]

+ 24 - 0
llmeval/commons/migrations/0017_evalsession_request_delay_and_more.py

@@ -0,0 +1,24 @@
+# Generated by Django 5.0.6 on 2024-07-30 07:08
+
+import django.core.validators
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('commons', '0016_qa_context'),
+    ]
+
+    operations = [
+        migrations.AddField(
+            model_name='evalsession',
+            name='request_delay',
+            field=models.IntegerField(blank=True, help_text='Request delay in seconds', null=True, validators=[django.core.validators.MinValueValidator(1), django.core.validators.MaxValueValidator(600)]),
+        ),
+        migrations.AlterField(
+            model_name='llmbackend',
+            name='client_type',
+            field=models.CharField(choices=[('openai', 'OpenAI'), ('ollama', 'Ollama'), ('genai', 'Google GenAI')], db_index=True, default='openai', max_length=100),
+        ),
+    ]

+ 23 - 0
llmeval/commons/migrations/0018_qa_category_alter_llmbackend_client_type.py

@@ -0,0 +1,23 @@
+# Generated by Django 5.0.6 on 2024-08-01 06:14
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('commons', '0017_evalsession_request_delay_and_more'),
+    ]
+
+    operations = [
+        migrations.AddField(
+            model_name='qa',
+            name='category',
+            field=models.CharField(blank=True, max_length=256, null=True),
+        ),
+        migrations.AlterField(
+            model_name='llmbackend',
+            name='client_type',
+            field=models.CharField(choices=[('openai', 'OpenAI'), ('ollama', 'Ollama'), ('genai', 'Google GenAI'), ('anthropic', 'Anthropic')], db_index=True, default='openai', max_length=100),
+        ),
+    ]

+ 18 - 0
llmeval/commons/migrations/0019_alter_qa_category.py

@@ -0,0 +1,18 @@
+# Generated by Django 5.0.6 on 2024-08-01 07:46
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('commons', '0018_qa_category_alter_llmbackend_client_type'),
+    ]
+
+    operations = [
+        migrations.AlterField(
+            model_name='qa',
+            name='category',
+            field=models.CharField(blank=True, db_index=True, max_length=256, null=True),
+        ),
+    ]

+ 18 - 0
llmeval/commons/migrations/0020_evalsession_is_active.py

@@ -0,0 +1,18 @@
+# Generated by Django 5.0.6 on 2024-08-01 12:18
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('commons', '0019_alter_qa_category'),
+    ]
+
+    operations = [
+        migrations.AddField(
+            model_name='evalsession',
+            name='is_active',
+            field=models.BooleanField(default=True),
+        ),
+    ]

+ 18 - 0
llmeval/commons/migrations/0021_alter_evalsession_is_active.py

@@ -0,0 +1,18 @@
+# Generated by Django 5.0.6 on 2024-08-01 12:22
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('commons', '0020_evalsession_is_active'),
+    ]
+
+    operations = [
+        migrations.AlterField(
+            model_name='evalsession',
+            name='is_active',
+            field=models.BooleanField(default=False),
+        ),
+    ]

+ 18 - 0
llmeval/commons/migrations/0022_evalsession_dataset_target.py

@@ -0,0 +1,18 @@
+# Generated by Django 5.0.6 on 2024-08-02 06:00
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('commons', '0021_alter_evalsession_is_active'),
+    ]
+
+    operations = [
+        migrations.AddField(
+            model_name='evalsession',
+            name='dataset_target',
+            field=models.CharField(choices=[('test', 'test'), ('train', 'train'), ('dev', 'dev')], db_index=True, default='test', max_length=100),
+        ),
+    ]

+ 28 - 0
llmeval/commons/migrations/0023_qa_extra_info_alter_evalsession_dataset_target_and_more.py

@@ -0,0 +1,28 @@
+# Generated by Django 5.0.6 on 2024-08-06 09:57
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('commons', '0022_evalsession_dataset_target'),
+    ]
+
+    operations = [
+        migrations.AddField(
+            model_name='qa',
+            name='extra_info',
+            field=models.JSONField(blank=True, null=True),
+        ),
+        migrations.AlterField(
+            model_name='evalsession',
+            name='dataset_target',
+            field=models.CharField(choices=[('test', 'test'), ('train', 'train'), ('dev', 'dev'), ('validation', 'validation')], db_index=True, default='test', max_length=100),
+        ),
+        migrations.AlterField(
+            model_name='qa',
+            name='target',
+            field=models.CharField(choices=[('test', 'test'), ('train', 'train'), ('dev', 'dev'), ('validation', 'validation')], db_index=True, default='test', max_length=100),
+        ),
+    ]

+ 0 - 0
llmeval/commons/migrations/__init__.py


+ 233 - 0
llmeval/commons/models.py

@@ -0,0 +1,233 @@
+import hashlib
+from django.db import models
+from django.core.validators import MinValueValidator, MaxValueValidator
+
+TEMPLATE_SYSTEM_PROMPT = """
+You are a helpful assistant designed to answer questions.
+Provide concise and accurate responses, ensuring you follow the provided instructions thoroughly.
+""".strip()
+
+TARGET_CHOICES = [
+    ('test', 'test'),
+    ('train', 'train'),
+    ('dev', 'dev'),
+    ('validation', 'validation'),
+]
+
+ROLE_CHOICES = [
+    ('assistant', 'assistant'),
+    ('user', 'user'),
+]
+
+CLIENT_CHOICES = [
+    ('openai', 'OpenAI'),
+    ('ollama', 'Ollama'),
+    ('genai', 'Google GenAI'),
+    ('anthropic', 'Anthropic'),
+]
+
+class Dataset(models.Model):
+    class Meta:
+        db_table = 'dataset'
+        verbose_name = 'Dataset'
+        verbose_name_plural = 'Datasets'
+
+    name = models.CharField(max_length=100)
+    description = models.TextField()
+    created_at = models.DateTimeField(auto_now_add=True)
+    updated_at = models.DateTimeField(auto_now=True)
+
+    def __str__(self):
+        return self.name
+
+class QA(models.Model):
+    class Meta:
+        db_table = 'qa'
+        verbose_name = 'QA'
+        verbose_name_plural = 'QAs'
+
+    dataset = models.ForeignKey('Dataset', on_delete=models.CASCADE)
+    question = models.TextField()
+    category = models.CharField(max_length=256, null=True, blank=True, db_index=True)
+    extra_info = models.JSONField(null=True, blank=True)
+    correct_answer = models.TextField()
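+    # Dataset split this question belongs to (test/train/dev/validation).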
+    target = models.CharField(
+        max_length=100,
+        default='test',
+        choices=TARGET_CHOICES,
+        db_index=True,
+    )
+    options = models.JSONField(null=True, blank=True)
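+    # External id carried over from the source dataset (e.g. a PubMed id); preferred over the pk in exports.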
+    xid = models.CharField(max_length=100, null=True, blank=True)
+    context = models.TextField(null=True, blank=True)
+    correct_answer_idx = models.CharField(max_length=100)
+
+    hash = models.CharField(max_length=100, unique=True)
+
+    def __str__(self):
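+        # Truncate to 50 chars; '...' is appended only when the question was actually cut.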
+        return self.question[:50] + (self.question[50:] and '...')
+    
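+    # MD5 fingerprint over the dataset name plus all identifying fields; it backs the
+    # unique 'hash' column, so the same question cannot be stored twice.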
+    def get_hash(self):
+        txt = self.dataset.name + self.question
+        if self.correct_answer:
+            txt += self.correct_answer
+        if self.correct_answer_idx:
+            txt += self.correct_answer_idx
+        if self.xid:
+            txt += self.xid
+        if self.target:
+            txt += self.target
+        if self.category:
+            txt += self.category
+        if self.extra_info:
+            txt += ''.join(self.extra_info)  # note: joining a dict concatenates its keys only
+        if self.context:
+            txt += self.context
+        if self.options:
+            txt += ''.join(self.options)  # keys for a dict, values for a list of strings
+        return hashlib.md5(txt.encode('utf-8')).hexdigest()
+
+    def save(self, *args, **kwargs):
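+        # Recompute the fingerprint on every save so it always matches the current field values.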
+        self.hash = self.get_hash()
+        super().save(*args, **kwargs)
+
+class LLMBackend(models.Model):
+    class Meta:
+        db_table = 'llm_backend'
+        verbose_name = 'LLM Backend'
+        verbose_name_plural = 'LLM Backends'
+    
+    name = models.CharField(max_length=100)
+    client_type = models.CharField(
+        max_length=100,
+        default='openai',
+        choices=CLIENT_CHOICES,
+        db_index=True
+    )
+    parameters = models.JSONField(null=True, blank=True)
+
+    def __str__(self):
+        return self.name
+
+class LLMModel(models.Model):
+    class Meta:
+        db_table = 'llm_model'
+        verbose_name = 'LLM Model'
+        verbose_name_plural = 'LLM Models'
+
+    name = models.CharField(max_length=100)
+    backend = models.ForeignKey('LLMBackend', on_delete=models.CASCADE)
+    parameters = models.JSONField(null=True, blank=True)
+    created_at = models.DateTimeField(auto_now_add=True)
+    updated_at = models.DateTimeField(auto_now=True)
+
+    def __str__(self):
+        return f"[{self.id}] {self.name}"
+
+class EvalConfig(models.Model):
+    class Meta:
+        db_table = 'eval_config'
+        verbose_name = 'EvalConfig'
+        verbose_name_plural = 'EvalConfig'
+
+    name = models.CharField(max_length=100)
+    description = models.TextField(null=True, blank=True)
+    dataset = models.ForeignKey('Dataset', on_delete=models.CASCADE)
+    sys_prompt = models.TextField(default=TEMPLATE_SYSTEM_PROMPT)
+    final_answer_pattern = models.TextField(default=r'(?i)Final Answer\s*:\s*([A-E])')  # raw string; captures the option letter after "Final Answer:"
+    created_at = models.DateTimeField(auto_now_add=True)
+    updated_at = models.DateTimeField(auto_now=True)
+
+    def __str__(self):
+        return self.name
+    
+class RoleMessage(models.Model):
+    class Meta:
+        db_table = 'role_message'
+        verbose_name = 'RoleMessage'
+        verbose_name_plural = 'RoleMessage'
+
+    role = models.CharField(max_length=100, choices=ROLE_CHOICES, db_index=True)
+    content = models.TextField()
+    eval_config = models.ForeignKey('EvalConfig', on_delete=models.CASCADE)
+
+    def __str__(self):
+        return self.content[:50] + (self.content[50:] and '...')
+    
+class EvalSession(models.Model):
+    class Meta:
+        db_table = 'eval_session'
+        verbose_name = 'EvalSession'
+        verbose_name_plural = 'EvalSession'
+
+    name = models.CharField(max_length=100)
+    config = models.ForeignKey('EvalConfig', on_delete=models.CASCADE)
+    llm_model = models.ForeignKey('LLMModel', on_delete=models.CASCADE)
+    answer_interpreter = models.ForeignKey('AnswerInterpreter', on_delete=models.SET_NULL, null=True, blank=True)
+    parameters = models.JSONField(null=True, blank=True)
+    dataset_target = models.CharField(
+        max_length=100, 
+        default='test', 
+        choices=TARGET_CHOICES,
+        db_index=True
+    )
+    request_delay = models.IntegerField(
+        null=True, blank=True,
+        validators=[MinValueValidator(1), MaxValueValidator(600)],
+        help_text="Request delay in seconds",
+    )
+    is_active = models.BooleanField(default=False)
+    created_at = models.DateTimeField(auto_now_add=True)
+    updated_at = models.DateTimeField(auto_now=True)
+    
+    def __str__(self):
+        return f"{self.name} [{self.config.name} - {self.llm_model.name}]"
+
+    def save(self, *args, **kwargs):
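+        # On first save, snapshot the LLM model's default parameters onto the session.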
+        if self.pk is None:
+            self.parameters = self.llm_model.parameters
+        super().save(*args, **kwargs)
+    
+    @property
+    def accuracy(self):
+        # Share of recorded answers marked correct; 0 when nothing has been evaluated yet.
+        total_count = self.evalanswer_set.count()
+        if total_count == 0:
+            return 0
+        return self.evalanswer_set.filter(is_correct=True).count() / total_count
+
+class EvalAnswer(models.Model):
+    class Meta:
+        db_table = 'eval_answer'
+        verbose_name = 'EvalAnswer'
+        verbose_name_plural = 'EvalAnswer'
+
+    eval_session = models.ForeignKey('EvalSession', on_delete=models.CASCADE)
+    question = models.ForeignKey('QA', on_delete=models.CASCADE)
+    instruction = models.TextField()
+    assistant_answer = models.TextField()
+    is_correct = models.BooleanField()
+    created_at = models.DateTimeField(auto_now_add=True)
+    hash = models.CharField(max_length=100)
+    llm_backend = models.ForeignKey('LLMBackend', on_delete=models.CASCADE)
+    llm_model = models.ForeignKey('LLMModel', on_delete=models.CASCADE)
+
+    def __str__(self):
+        return f"{self.question} - {self.is_correct}"
+    
+
+class AnswerInterpreter(models.Model):
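+    # Optional helper: an LLM plus prompt used to interpret a model's free-form answer.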
+    class Meta:
+        db_table = 'answer_interpreter'
+        verbose_name = 'AnswerInterpreter'
+        verbose_name_plural = 'AnswerInterpreter'
+
+    name = models.CharField(max_length=100)
+    prompt = models.TextField()
+    llm_model = models.ForeignKey('LLMModel', on_delete=models.CASCADE)
+    created_at = models.DateTimeField(auto_now_add=True)
+    updated_at = models.DateTimeField(auto_now=True)
+
+    def __str__(self):
+        return self.name

+ 3 - 0
llmeval/commons/tests.py

@@ -0,0 +1,3 @@
+from django.test import TestCase
+
+# Create your tests here.

+ 2 - 0
llmeval/commons/urls.py

@@ -0,0 +1,2 @@
+from django.contrib import admin
+from django.urls import path
+
+# Placeholder URLconf; project routes live in llmeval/urls.py (ROOT_URLCONF).
+urlpatterns = []

+ 68 - 0
llmeval/commons/views.py

@@ -0,0 +1,68 @@
+import csv
+
+from django.shortcuts import render
+from django.http import HttpResponse
+
+
+from .models import EvalSession, EvalAnswer, EvalConfig, QA
+
+def eval_download_results(request, session_id):
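+    # Stream one session's recorded answers as a CSV attachment, one row per question.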
+    session = EvalSession.objects.get(id=session_id)
+    eval_answers = EvalAnswer.objects.filter(eval_session=session).order_by('question__id')
+    response = HttpResponse(content_type='text/csv')
+
+    writer = csv.writer(response)
+    writer.writerow(['Question ID', 'Question', 'Correct Answer', 'Correct Answer Index', 'Instruction', 'Assistant answer', 'Is Correct'])
+
+    for eval_answer in eval_answers:
+        writer.writerow([
+            eval_answer.question.id, 
+            eval_answer.question.question, 
+            eval_answer.question.correct_answer, 
+            eval_answer.question.correct_answer_idx, 
+            eval_answer.instruction, 
+            eval_answer.assistant_answer, 
+            eval_answer.is_correct])
+
+    response['Content-Disposition'] = f'attachment; filename="{session.name}.csv"'
+    return response
+
+def config_download_results(request, config_id):
+    response = HttpResponse(content_type='text/csv')
+    config = EvalConfig.objects.get(id=config_id)
+    sessions = EvalSession.objects.filter(config=config).filter(is_active=True)
+    
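+    # The extra-info column set is taken from the first QA row that has any extra_info.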
+    extra_info_fields = []
+    questions = dict()
+    for qa in QA.objects.filter(dataset=config.dataset):
+        questions[qa.id] = qa
+        if not extra_info_fields and qa.extra_info:
+            extra_info_fields = qa.extra_info.keys()
+    
+
+    models_data = {}
+    for session in sessions:
+        eval_answers = EvalAnswer.objects.filter(eval_session=session).order_by('question__id')
+        data = {}
+        for eval_answer in eval_answers:
+            data[eval_answer.question.id] = 1 if eval_answer.is_correct else 0
+        models_data[session.llm_model.name] = data
+
+    writer = csv.writer(response)
+    writer.writerow(['Question ID', 'Question Text'] +
+                    list(extra_info_fields) +
+                    list(models_data.keys()))
+
+    # One row per question answered in the first session; one column per model.
+    first_model_answers = next(iter(models_data.values()), {})
+    for question_id in first_model_answers.keys():
+        qa = questions[question_id]
+        qid = qa.xid if qa.xid else question_id
+        row = ([qid, qa.question] +
+               [(qa.extra_info or {}).get(field, '') for field in extra_info_fields] +
+               [data.get(question_id, '') for data in models_data.values()])
+        writer.writerow(row)
+
+    # Footer: per-model accuracy over the answers it actually recorded.
+    row = (['Accuracy', ''] + ['' for _ in extra_info_fields] +
+           [f"{sum(data.values()) / len(data) * 100:.2f}%" if data else '' for data in models_data.values()])
+    writer.writerow(row)
+    response['Content-Disposition'] = f'attachment; filename="{config.name}.csv"'
+    return response

File diff suppressed because it is too large
+ 1272 - 0
llmeval/datasets/medqa/4_options/phrases_no_exclude_dev.jsonl


File diff suppressed because it is too large
+ 1273 - 0
llmeval/datasets/medqa/4_options/phrases_no_exclude_test.jsonl


File diff suppressed because it is too large
+ 10178 - 0
llmeval/datasets/medqa/4_options/phrases_no_exclude_train.jsonl


File diff suppressed because it is too large
+ 14369 - 0
llmeval/datasets/medqa/US_qbank.jsonl


File diff suppressed because it is too large
+ 1272 - 0
llmeval/datasets/medqa/dev.jsonl


File diff suppressed because it is too large
+ 1273 - 0
llmeval/datasets/medqa/test.jsonl


File diff suppressed because it is too large
+ 10178 - 0
llmeval/datasets/medqa/train.jsonl


File diff suppressed because it is too large
+ 35173 - 0
llmeval/datasets/pubmedqa/ori_pqal.json


+ 502 - 0
llmeval/datasets/pubmedqa/test_ground_truth.json

@@ -0,0 +1,502 @@
+{
+    "12377809": "yes",
+    "26163474": "yes",
+    "19100463": "yes",
+    "18537964": "yes",
+    "12913878": "yes",
+    "12765819": "yes",
+    "25475395": "yes",
+    "19130332": "yes",
+    "9427037": "yes",
+    "24481006": "yes",
+    "8165771": "yes",
+    "22680064": "yes",
+    "22540518": "yes",
+    "20629769": "yes",
+    "21726930": "yes",
+    "21481154": "yes",
+    "22902073": "yes",
+    "26370095": "yes",
+    "18041059": "yes",
+    "15041506": "yes",
+    "11146778": "yes",
+    "27281318": "yes",
+    "21645374": "yes",
+    "9465206": "yes",
+    "25887165": "yes",
+    "15995461": "yes",
+    "21850494": "yes",
+    "19106867": "yes",
+    "21342862": "yes",
+    "24352924": "yes",
+    "16147837": "yes",
+    "26879871": "yes",
+    "15918864": "yes",
+    "22075911": "yes",
+    "11035130": "yes",
+    "21228436": "yes",
+    "11833948": "yes",
+    "17682349": "yes",
+    "17355582": "yes",
+    "15597845": "yes",
+    "10158597": "yes",
+    "27549226": "yes",
+    "26348845": "yes",
+    "25588461": "yes",
+    "23359100": "yes",
+    "26548832": "yes",
+    "25756710": "yes",
+    "20297950": "yes",
+    "24622801": "yes",
+    "9722752": "yes",
+    "20577124": "yes",
+    "19027440": "yes",
+    "18239988": "yes",
+    "27858166": "yes",
+    "27050489": "yes",
+    "16266387": "yes",
+    "27287237": "yes",
+    "11079675": "yes",
+    "10331115": "yes",
+    "18594195": "yes",
+    "22497340": "yes",
+    "16769333": "yes",
+    "20571467": "yes",
+    "12094116": "yes",
+    "17276182": "yes",
+    "26419377": "yes",
+    "23810330": "yes",
+    "15151701": "yes",
+    "23736032": "yes",
+    "28143468": "yes",
+    "23495128": "yes",
+    "12121321": "yes",
+    "18570208": "yes",
+    "28707539": "yes",
+    "22117569": "yes",
+    "18783922": "yes",
+    "15528969": "yes",
+    "19482903": "yes",
+    "11977907": "yes",
+    "17306983": "yes",
+    "24318956": "yes",
+    "22266735": "yes",
+    "22453060": "yes",
+    "10401824": "yes",
+    "15208005": "yes",
+    "16713745": "yes",
+    "21864397": "yes",
+    "25810292": "yes",
+    "11943048": "yes",
+    "23347337": "yes",
+    "23992109": "yes",
+    "10922093": "yes",
+    "26601554": "yes",
+    "15489384": "yes",
+    "27818079": "yes",
+    "24340838": "yes",
+    "16971978": "yes",
+    "21689015": "yes",
+    "12846929": "yes",
+    "22694248": "yes",
+    "15488260": "yes",
+    "23690198": "yes",
+    "10173769": "yes",
+    "12098035": "yes",
+    "23448747": "yes",
+    "24359102": "yes",
+    "14697414": "yes",
+    "15050326": "yes",
+    "27131771": "yes",
+    "26923375": "yes",
+    "15841770": "yes",
+    "25503376": "yes",
+    "19394934": "yes",
+    "22188074": "yes",
+    "21394762": "yes",
+    "9582182": "yes",
+    "28056802": "yes",
+    "18182265": "yes",
+    "9142039": "yes",
+    "20084845": "yes",
+    "24298614": "yes",
+    "12145243": "yes",
+    "21952349": "yes",
+    "27592038": "yes",
+    "25481573": "yes",
+    "20306735": "yes",
+    "26864326": "yes",
+    "21123461": "yes",
+    "12963175": "yes",
+    "10548670": "yes",
+    "21848798": "yes",
+    "25675614": "yes",
+    "25986020": "yes",
+    "18472368": "yes",
+    "26578404": "yes",
+    "14872327": "yes",
+    "23412195": "yes",
+    "24139705": "yes",
+    "23224030": "yes",
+    "24013712": "yes",
+    "15943725": "yes",
+    "27456836": "yes",
+    "24671913": "yes",
+    "22825590": "yes",
+    "23361217": "yes",
+    "18307476": "yes",
+    "22237146": "yes",
+    "25043083": "yes",
+    "26561905": "yes",
+    "23517744": "yes",
+    "27136599": "yes",
+    "10749257": "yes",
+    "17598882": "yes",
+    "15223779": "yes",
+    "16776337": "yes",
+    "23916653": "yes",
+    "10201555": "yes",
+    "24751724": "yes",
+    "8910148": "yes",
+    "18065862": "yes",
+    "22617083": "yes",
+    "25499207": "yes",
+    "16465002": "yes",
+    "25940336": "yes",
+    "24191126": "yes",
+    "8375607": "yes",
+    "26965932": "yes",
+    "22012962": "yes",
+    "12442934": "yes",
+    "19430778": "yes",
+    "20605051": "yes",
+    "19108857": "yes",
+    "24516646": "yes",
+    "25752725": "yes",
+    "20537205": "yes",
+    "20602784": "yes",
+    "22302761": "yes",
+    "18322741": "yes",
+    "14692023": "yes",
+    "22348433": "yes",
+    "26215326": "yes",
+    "23539689": "yes",
+    "9363244": "yes",
+    "24507422": "yes",
+    "22350859": "yes",
+    "19640728": "yes",
+    "23806388": "yes",
+    "9920954": "yes",
+    "8916748": "yes",
+    "11970923": "yes",
+    "19302863": "yes",
+    "22532370": "yes",
+    "18179827": "yes",
+    "18399830": "yes",
+    "12595848": "yes",
+    "18158048": "yes",
+    "23848044": "yes",
+    "11481172": "yes",
+    "28247485": "yes",
+    "24977765": "yes",
+    "14551704": "yes",
+    "12632437": "yes",
+    "20850631": "yes",
+    "17565137": "yes",
+    "9616411": "yes",
+    "22720085": "yes",
+    "21074975": "yes",
+    "25604390": "yes",
+    "14968373": "yes",
+    "10135926": "yes",
+    "19419587": "yes",
+    "23379759": "yes",
+    "19923859": "yes",
+    "22656647": "yes",
+    "12163782": "yes",
+    "21658267": "yes",
+    "9199905": "yes",
+    "23375036": "yes",
+    "24495711": "yes",
+    "26820719": "yes",
+    "26516021": "yes",
+    "20064872": "yes",
+    "15708048": "yes",
+    "29112560": "yes",
+    "23949294": "yes",
+    "10877371": "yes",
+    "23870157": "yes",
+    "18540901": "yes",
+    "21420186": "yes",
+    "12484580": "yes",
+    "23321509": "yes",
+    "26907557": "yes",
+    "22644412": "yes",
+    "25521278": "yes",
+    "21845457": "yes",
+    "18565233": "yes",
+    "17894828": "yes",
+    "10490564": "yes",
+    "7860319": "yes",
+    "18568239": "yes",
+    "9488747": "yes",
+    "20354380": "yes",
+    "24245816": "yes",
+    "11481599": "yes",
+    "27217036": "yes",
+    "23283159": "yes",
+    "19593710": "yes",
+    "18693227": "yes",
+    "21346501": "yes",
+    "17910536": "yes",
+    "26304701": "yes",
+    "18616781": "yes",
+    "9483814": "yes",
+    "12848629": "yes",
+    "25280365": "yes",
+    "25311479": "yes",
+    "16046584": "yes",
+    "26418441": "yes",
+    "22683044": "yes",
+    "26200172": "yes",
+    "20121683": "yes",
+    "18222909": "yes",
+    "12221908": "yes",
+    "24014276": "yes",
+    "24270957": "yes",
+    "18507507": "yes",
+    "16772913": "yes",
+    "12172698": "yes",
+    "26460153": "yes",
+    "12419743": "yes",
+    "25725704": "yes",
+    "25669733": "yes",
+    "24614789": "yes",
+    "24996865": "yes",
+    "18928979": "yes",
+    "25699562": "yes",
+    "24577079": "no",
+    "24669960": "no",
+    "15502995": "no",
+    "21214884": "no",
+    "24476003": "no",
+    "22758782": "no",
+    "14627582": "no",
+    "24666444": "no",
+    "18496363": "no",
+    "12040336": "no",
+    "14631523": "no",
+    "21823940": "no",
+    "17971187": "no",
+    "27642458": "no",
+    "12970636": "no",
+    "11138995": "no",
+    "15388567": "no",
+    "19142546": "no",
+    "8921484": "no",
+    "26209118": "no",
+    "22668852": "no",
+    "18019905": "no",
+    "18378554": "no",
+    "24073931": "no",
+    "7547656": "no",
+    "28359277": "no",
+    "18667100": "no",
+    "10781708": "no",
+    "22522271": "no",
+    "11955750": "no",
+    "26126304": "no",
+    "27338535": "no",
+    "24799031": "no",
+    "18319270": "no",
+    "21789019": "no",
+    "11567820": "no",
+    "10966943": "no",
+    "8199520": "no",
+    "21889895": "no",
+    "26113007": "no",
+    "17208539": "no",
+    "20538207": "no",
+    "9603166": "no",
+    "21194998": "no",
+    "21252642": "no",
+    "16678696": "no",
+    "20549895": "no",
+    "16418930": "no",
+    "8521557": "no",
+    "16809243": "no",
+    "10798511": "no",
+    "10834864": "no",
+    "16962519": "no",
+    "19575104": "no",
+    "24809662": "no",
+    "20602101": "no",
+    "26852225": "no",
+    "19398929": "no",
+    "25614468": "no",
+    "11926574": "no",
+    "10973547": "no",
+    "26471488": "no",
+    "19520213": "no",
+    "23677366": "no",
+    "17342562": "no",
+    "16296668": "no",
+    "17054994": "no",
+    "26556589": "no",
+    "15052394": "no",
+    "22513023": "no",
+    "15919266": "no",
+    "15095519": "no",
+    "12006913": "no",
+    "8738894": "no",
+    "21431987": "no",
+    "22154448": "no",
+    "15053041": "no",
+    "22365295": "no",
+    "19546588": "no",
+    "7482275": "no",
+    "24698298": "no",
+    "18274917": "no",
+    "21946341": "no",
+    "23568387": "no",
+    "21256734": "no",
+    "22534881": "no",
+    "8566975": "no",
+    "23761381": "no",
+    "22668712": "no",
+    "22023714": "no",
+    "22504515": "no",
+    "21164063": "no",
+    "18359123": "no",
+    "16827975": "no",
+    "24922528": "no",
+    "15774570": "no",
+    "20736887": "no",
+    "11483547": "no",
+    "9542484": "no",
+    "18708308": "no",
+    "18435678": "no",
+    "23455575": "no",
+    "22537902": "no",
+    "18926458": "no",
+    "12090319": "no",
+    "12380309": "no",
+    "27989969": "no",
+    "25752912": "no",
+    "26536001": "no",
+    "21849531": "no",
+    "16872243": "no",
+    "23571528": "no",
+    "19481382": "no",
+    "23621776": "no",
+    "22227642": "no",
+    "23025584": "no",
+    "11862129": "no",
+    "22236315": "no",
+    "21361755": "no",
+    "18719011": "no",
+    "11438275": "no",
+    "16778275": "no",
+    "17051586": "no",
+    "24061619": "no",
+    "22233470": "no",
+    "23497210": "no",
+    "25488308": "no",
+    "22382608": "no",
+    "19237087": "no",
+    "10381996": "no",
+    "9100537": "no",
+    "23422012": "no",
+    "22876568": "no",
+    "17445978": "no",
+    "20608141": "no",
+    "23177368": "no",
+    "8847047": "no",
+    "22011946": "no",
+    "27394685": "no",
+    "23794696": "no",
+    "23076787": "no",
+    "19854401": "no",
+    "14652839": "no",
+    "7664228": "no",
+    "17062234": "no",
+    "24449622": "no",
+    "12070552": "no",
+    "19836806": "no",
+    "19913785": "no",
+    "24739448": "no",
+    "24625433": "no",
+    "16403186": "no",
+    "10375486": "no",
+    "17032327": "no",
+    "27643961": "no",
+    "22042121": "no",
+    "17274051": "no",
+    "27096199": "no",
+    "7497757": "no",
+    "21459725": "no",
+    "27040842": "no",
+    "20187289": "no",
+    "21712147": "no",
+    "10456814": "no",
+    "17192736": "no",
+    "27757987": "no",
+    "12769830": "no",
+    "22251324": "no",
+    "28196511": "no",
+    "18284441": "maybe",
+    "18802997": "maybe",
+    "17621202": "maybe",
+    "11411430": "maybe",
+    "26708803": "maybe",
+    "25079920": "maybe",
+    "25793749": "maybe",
+    "19103915": "maybe",
+    "11867487": "maybe",
+    "12630042": "maybe",
+    "19468282": "maybe",
+    "16538201": "maybe",
+    "20971618": "maybe",
+    "24336869": "maybe",
+    "20197761": "maybe",
+    "16968876": "maybe",
+    "26778755": "maybe",
+    "18568290": "maybe",
+    "25371231": "maybe",
+    "25394614": "maybe",
+    "11570976": "maybe",
+    "16816043": "maybe",
+    "12805495": "maybe",
+    "25571931": "maybe",
+    "19578820": "maybe",
+    "18243752": "maybe",
+    "11458136": "maybe",
+    "12790890": "maybe",
+    "18714572": "maybe",
+    "25103647": "maybe",
+    "24995509": "maybe",
+    "27044366": "maybe",
+    "26606599": "maybe",
+    "17076091": "maybe",
+    "26037986": "maybe",
+    "22491528": "maybe",
+    "24591144": "maybe",
+    "19351635": "maybe",
+    "20337202": "maybe",
+    "23149821": "maybe",
+    "18235194": "maybe",
+    "16392897": "maybe",
+    "17940352": "maybe",
+    "27615402": "maybe",
+    "25779009": "maybe",
+    "12407608": "maybe",
+    "14599616": "maybe",
+    "10223070": "maybe",
+    "20736672": "maybe",
+    "25277731": "maybe",
+    "17691856": "maybe",
+    "16735905": "maybe",
+    "19694846": "maybe",
+    "25007420": "maybe",
+    "26134053": "maybe"
+}

+ 0 - 0
llmeval/llmeval/__init__.py


+ 16 - 0
llmeval/llmeval/asgi.py

@@ -0,0 +1,16 @@
+"""
+ASGI config for llmeval project.
+
+It exposes the ASGI callable as a module-level variable named ``application``.
+
+For more information on this file, see
+https://docs.djangoproject.com/en/5.0/howto/deployment/asgi/
+"""
+
+import os
+
+from django.core.asgi import get_asgi_application
+
+os.environ.setdefault("DJANGO_SETTINGS_MODULE", "llmeval.settings")
+
+application = get_asgi_application()

+ 125 - 0
llmeval/llmeval/settings.py

@@ -0,0 +1,125 @@
+"""
+Django settings for llmeval project.
+
+Generated by 'django-admin startproject' using Django 5.0.6.
+
+For more information on this file, see
+https://docs.djangoproject.com/en/5.0/topics/settings/
+
+For the full list of settings and their values, see
+https://docs.djangoproject.com/en/5.0/ref/settings/
+"""
+
+from pathlib import Path
+
+# Build paths inside the project like this: BASE_DIR / 'subdir'.
+BASE_DIR = Path(__file__).resolve().parent.parent
+
+
+# Quick-start development settings - unsuitable for production
+# See https://docs.djangoproject.com/en/5.0/howto/deployment/checklist/
+
+# SECURITY WARNING: keep the secret key used in production secret!
+SECRET_KEY = "django-insecure-@cpx=q+&a159eujsd@r&gkz0l5t%$0o=)ncvrh_v@y+j9tm_fh"
+
+# SECURITY WARNING: don't run with debug turned on in production!
+DEBUG = True
+
+ALLOWED_HOSTS = ['*']
+CSRF_TRUSTED_ORIGINS = ['https://*.ngrok-free.app']
+
+# Application definition
+
+INSTALLED_APPS = [
+    "commons",
+    "django.contrib.admin",
+    "django.contrib.auth",
+    "django.contrib.contenttypes",
+    "django.contrib.sessions",
+    "django.contrib.messages",
+    "django.contrib.staticfiles",
+]
+
+MIDDLEWARE = [
+    "django.middleware.security.SecurityMiddleware",
+    "django.contrib.sessions.middleware.SessionMiddleware",
+    "django.middleware.common.CommonMiddleware",
+    "django.middleware.csrf.CsrfViewMiddleware",
+    "django.contrib.auth.middleware.AuthenticationMiddleware",
+    "django.contrib.messages.middleware.MessageMiddleware",
+    "django.middleware.clickjacking.XFrameOptionsMiddleware",
+]
+
+ROOT_URLCONF = "llmeval.urls"
+
+TEMPLATES = [
+    {
+        "BACKEND": "django.template.backends.django.DjangoTemplates",
+        "DIRS": [],
+        "APP_DIRS": True,
+        "OPTIONS": {
+            "context_processors": [
+                "django.template.context_processors.debug",
+                "django.template.context_processors.request",
+                "django.contrib.auth.context_processors.auth",
+                "django.contrib.messages.context_processors.messages",
+            ],
+        },
+    },
+]
+
+WSGI_APPLICATION = "llmeval.wsgi.application"
+
+
+# Database
+# https://docs.djangoproject.com/en/5.0/ref/settings/#databases
+
+DATABASES = {
+    "default": {
+        "ENGINE": "django.db.backends.sqlite3",
+        "NAME": BASE_DIR / "db.sqlite3",
+    },
+    # "production": {
+    #     "ENGINE": "django.db.backends.postgresql",
+    #     "NAME": "llmevals",
+    #     "USER:": "llmevals",
+    #     "PASSWORD": "somepassword",
+    #     "HOST": "pgdbhost",
+    #     "PORT": "5432",
+    # },
+}
+
+# Password validation
+# https://docs.djangoproject.com/en/5.0/ref/settings/#auth-password-validators
+
+AUTH_PASSWORD_VALIDATORS = [
+    {
+        "NAME": "django.contrib.auth.password_validation.UserAttributeSimilarityValidator",
+    },
+    {"NAME": "django.contrib.auth.password_validation.MinimumLengthValidator",},
+    {"NAME": "django.contrib.auth.password_validation.CommonPasswordValidator",},
+    {"NAME": "django.contrib.auth.password_validation.NumericPasswordValidator",},
+]
+
+
+# Internationalization
+# https://docs.djangoproject.com/en/5.0/topics/i18n/
+
+LANGUAGE_CODE = "en-us"
+
+TIME_ZONE = "UTC"
+
+USE_I18N = True
+
+USE_TZ = True
+
+
+# Static files (CSS, JavaScript, Images)
+# https://docs.djangoproject.com/en/5.0/howto/static-files/
+
+STATIC_URL = "static/"
+
+# Default primary key field type
+# https://docs.djangoproject.com/en/5.0/ref/settings/#default-auto-field
+
+DEFAULT_AUTO_FIELD = "django.db.models.BigAutoField"

+ 22 - 0
llmeval/llmeval/urls.py

@@ -0,0 +1,22 @@
+"""
+URL configuration for llmeval project.
+
+The `urlpatterns` list routes URLs to views. For more information please see:
+    https://docs.djangoproject.com/en/5.0/topics/http/urls/
+Examples:
+Function views
+    1. Add an import:  from my_app import views
+    2. Add a URL to urlpatterns:  path('', views.home, name='home')
+Class-based views
+    1. Add an import:  from other_app.views import Home
+    2. Add a URL to urlpatterns:  path('', Home.as_view(), name='home')
+Including another URLconf
+    1. Import the include() function: from django.urls import include, path
+    2. Add a URL to urlpatterns:  path('blog/', include('blog.urls'))
+"""
+from django.contrib import admin
+from django.urls import path
+
+urlpatterns = [
+    path("admin/", admin.site.urls),
+]

+ 16 - 0
llmeval/llmeval/wsgi.py

@@ -0,0 +1,16 @@
+"""
+WSGI config for llmeval project.
+
+It exposes the WSGI callable as a module-level variable named ``application``.
+
+For more information on this file, see
+https://docs.djangoproject.com/en/5.0/howto/deployment/wsgi/
+"""
+
+import os
+
+from django.core.wsgi import get_wsgi_application
+
+os.environ.setdefault("DJANGO_SETTINGS_MODULE", "llmeval.settings")
+
+application = get_wsgi_application()

+ 22 - 0
llmeval/manage.py

@@ -0,0 +1,22 @@
+#!/usr/bin/env python
+"""Django's command-line utility for administrative tasks."""
+import os
+import sys
+
+
+def main():
+    """Run administrative tasks."""
+    os.environ.setdefault("DJANGO_SETTINGS_MODULE", "llmeval.settings")
+    try:
+        from django.core.management import execute_from_command_line
+    except ImportError as exc:
+        raise ImportError(
+            "Couldn't import Django. Are you sure it's installed and "
+            "available on your PYTHONPATH environment variable? Did you "
+            "forget to activate a virtual environment?"
+        ) from exc
+    execute_from_command_line(sys.argv)
+
+
+if __name__ == "__main__":
+    main()

+ 70 - 0
llmeval/requirements.txt

@@ -0,0 +1,70 @@
+aiohttp==3.9.5
+aiosignal==1.3.1
+annotated-types==0.7.0
+anthropic==0.31.2
+anyio==4.4.0
+asgiref==3.8.1
+async-timeout==4.0.3
+attrs==23.2.0
+cachetools==5.4.0
+certifi==2024.7.4
+charset-normalizer==3.3.2
+datasets==2.20.0
+dill==0.3.8
+distro==1.9.0
+Django==5.0.6
+exceptiongroup==1.2.1
+filelock==3.15.4
+frozenlist==1.4.1
+fsspec==2024.5.0
+google-ai-generativelanguage==0.6.6
+google-api-core==2.19.1
+google-api-python-client==2.138.0
+google-auth==2.32.0
+google-auth-httplib2==0.2.0
+google-generativeai==0.7.2
+googleapis-common-protos==1.63.2
+grpcio==1.65.1
+grpcio-status==1.62.2
+h11==0.14.0
+httpcore==1.0.5
+httplib2==0.22.0
+httpx==0.27.0
+huggingface==0.0.1
+huggingface-hub==0.23.4
+idna==3.7
+jiter==0.5.0
+multidict==6.0.5
+multiprocess==0.70.16
+numpy==2.0.0
+ollama==0.2.1
+openai==1.35.10
+packaging==24.1
+pandas==2.2.2
+proto-plus==1.24.0
+protobuf==4.25.4
+psycopg2==2.9.9
+pyarrow==16.1.0
+pyarrow-hotfix==0.6
+pyasn1==0.6.0
+pyasn1_modules==0.4.0
+pydantic==2.8.2
+pydantic_core==2.20.1
+pyparsing==3.1.2
+python-dateutil==2.9.0.post0
+python-dotenv==1.0.1
+pytz==2024.1
+PyYAML==6.0.1
+requests==2.32.3
+rsa==4.9
+six==1.16.0
+sniffio==1.3.1
+sqlparse==0.5.0
+tokenizers==0.19.1
+tqdm==4.66.4
+typing_extensions==4.12.2
+tzdata==2024.1
+uritemplate==4.1.1
+urllib3==2.2.2
+xxhash==3.4.1
+yarl==1.9.4