Просмотр исходного кода

fix: leave only Llama4 Maverick model

Young Han 10 месяцев назад
Родитель
Commit
45a037c38a
1 изменённый файл: 11 строк добавлено и 24 удалено
  1. 11 24
      end-to-end-use-cases/book-character-mindmap/server/server.py

+ 11 - 24
end-to-end-use-cases/book-character-mindmap/server/server.py

@@ -126,30 +126,17 @@ You are an expert search AI designed to help users find detailed information abo
 Use this format to assist users in finding the relationship information they need.
 """
 
 
-HEAVY_MODEL = True
-
-if HEAVY_MODEL:
-    # LLM_MODEL = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
-    LLM_MODEL = "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
-    llm = LLM(
-        model=LLM_MODEL,
-        enforce_eager=False,
-        tensor_parallel_size=8,
-        max_model_len=500000,
-        override_generation_config={
-            "attn_temperature_tuning": True,
-        },
-    )
-    sampling_params = SamplingParams(temperature=0.5, top_p=0.95, max_tokens=10000)
-else:
-    LLM_MODEL = "meta-llama/Llama-3.2-3B-Instruct"
-    llm = LLM(
-        model=LLM_MODEL,
-        enforce_eager=False,
-        tensor_parallel_size=1,  # Reduce for smaller model
-        max_model_len=4096,  # Smaller context window for efficiency
-    )
-    sampling_params = SamplingParams(temperature=1, top_p=0.95, max_tokens=4096)
+LLM_MODEL = "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
+llm = LLM(
+    model=LLM_MODEL,
+    enforce_eager=False,
+    tensor_parallel_size=8,
+    max_model_len=500000,
+    override_generation_config={
+        "attn_temperature_tuning": True,
+    },
+)
+sampling_params = SamplingParams(temperature=0.5, top_p=0.95, max_tokens=10000)
 
 
 
 
 @app.route("/inference", methods=["POST"])