Explorar o código

fix: leave only Llama4 Maverick model

Young Han hai 7 meses
pai
achega
87728e5e0f
Modificáronse 1 ficheiros con 11 adicións e 24 borrados
  1. 11 24
      end-to-end-use-cases/book-character-mindmap/server/server.py

+ 11 - 24
end-to-end-use-cases/book-character-mindmap/server/server.py

@@ -126,30 +126,17 @@ You are an expert search AI designed to help users find detailed information abo
 Use this format to assist users in finding the relationship information they need.
 """
 
 
-HEAVY_MODEL = True
-
-if HEAVY_MODEL:
-    # LLM_MODEL = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
-    LLM_MODEL = "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
-    llm = LLM(
-        model=LLM_MODEL,
-        enforce_eager=False,
-        tensor_parallel_size=8,
-        max_model_len=500000,
-        override_generation_config={
-            "attn_temperature_tuning": True,
-        },
-    )
-    sampling_params = SamplingParams(temperature=0.5, top_p=0.95, max_tokens=10000)
-else:
-    LLM_MODEL = "meta-llama/Llama-3.2-3B-Instruct"
-    llm = LLM(
-        model=LLM_MODEL,
-        enforce_eager=False,
-        tensor_parallel_size=1,  # Reduce for smaller model
-        max_model_len=4096,  # Smaller context window for efficiency
-    )
-    sampling_params = SamplingParams(temperature=1, top_p=0.95, max_tokens=4096)
+LLM_MODEL = "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
+llm = LLM(
+    model=LLM_MODEL,
+    enforce_eager=False,
+    tensor_parallel_size=8,
+    max_model_len=500000,
+    override_generation_config={
+        "attn_temperature_tuning": True,
+    },
+)
+sampling_params = SamplingParams(temperature=0.5, top_p=0.95, max_tokens=10000)
 
 
 
 
 @app.route("/inference", methods=["POST"])