Remove QWEN samples from benchmark (#7)
svilupp committed Feb 14, 2024
1 parent 6c08b5d commit 909472e
Showing 8 changed files with 14 additions and 9 deletions.
11 changes: 3 additions & 8 deletions README.md
@@ -109,20 +109,12 @@ The best-performing models are in general around 33/34Bn parameters - Phind Code
| solar:10.7b-instruct-v1-q4_K_M | 18.8 | 17.7 | 35.2 | 50.0 | 31.1 | 107.0 | 10.0 |
| mistral:7b-instruct-q4_K_M | 13.9 | 13.0 | 34.8 | 50.0 | 26.5 | 80.0 | 0.0 |
| codellama:70b-instruct-q2_K | 11.2 | 9.4 | 29.8 | 0.0 | 37.7 | 198.0 | 29.0 |
-| qwen:72b-chat-v1.5-q2_K | 8.9 | 8.4 | 27.2 | 0.0 | 37.6 | 211.0 | 45.0 |
-| qwen:72b-chat-v1.5-q4_K_M | 11.4 | 10.4 | 27.0 | 0.0 | 40.4 | 231.0 | 52.0 |
| llama2 | 17.1 | 16.3 | 26.5 | 25.0 | 26.5 | 131.0 | 0.0 |
| orca2:13b | 20.1 | 18.3 | 23.1 | 0.0 | 30.6 | 166.0 | 11.0 |
| stablelm-zephyr | 9.9 | 7.7 | 15.4 | 0.0 | 23.5 | 192.0 | 1.0 |
| dolphin-phi:2.7b-v2.6-q6_K | 8.9 | 8.4 | 14.9 | 0.0 | 22.9 | 188.0 | 0.0 |
| codellama:13b-python | 12.5 | 10.7 | 12.8 | 0.0 | 22.1 | 155.0 | 0.0 |
| phi:2.7b-chat-v2-q6_K | 13.0 | 11.6 | 8.9 | 0.0 | 19.4 | 222.0 | 0.0 |
-| qwen:14b-chat-v1.5-q4_K_M | 3.2 | 2.9 | 8.1 | 0.0 | 21.4 | 299.0 | 6.0 |
-| qwen:14b-chat-v1.5-q6_K | 3.8 | 3.5 | 6.5 | 0.0 | 19.9 | 310.0 | 5.0 |
-| qwen:7b-chat-v1.5-q4_K_M | 2.2 | 2.0 | 3.4 | 0.0 | 14.7 | 329.0 | 1.0 |
-| qwen:4b-chat-v1.5-q6_K | 2.7 | 1.5 | 3.3 | 0.0 | 12.4 | 327.0 | 0.0 |
-| qwen:7b-chat-v1.5-q6_K | 2.7 | 2.5 | 3.0 | 0.0 | 12.3 | 326.0 | 0.0 |
-

Same information, but as a bar chart:

@@ -131,6 +123,9 @@
And with a separate bar for each prompt template:
![Model-Prompt-Scores-for-local-models](assets/model-prompt-comparison-local.png)

+> [!NOTE]
+> Qwen-1.5 models have been removed from the overviews because the underlying model on the Ollama repository (and HF) is not correct and performs very poorly.
+
> [!NOTE]
> I have noticed that some evals in Ollama/llama.cpp now score slightly higher than in Dec-23, so re-running the above evals is on the roadmap.
Binary file modified assets/elapsed-vs-score-scatter-local.png
Binary file modified assets/model-comparison-local.png
Binary file modified assets/model-prompt-comparison-local.png
2 changes: 2 additions & 0 deletions examples/compare_paid_vs_local.jl
@@ -39,6 +39,8 @@ PROMPTS = [
df = @chain begin
    load_evals(DIR_RESULTS; max_history = 5)
    @rsubset :prompt_label in PROMPTS
+    ## remove qwen models as they are not correct!
+    @rsubset !occursin("qwen", :model)
end;
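
For readers unfamiliar with DataFramesMeta, here is a minimal, self-contained sketch of what the added `@rsubset !occursin("qwen", :model)` filter does; the rows below are invented for illustration:

```julia
using DataFramesMeta

## Toy stand-in for the benchmark results table (values are made up)
df = DataFrame(model = ["qwen:72b-chat-v1.5-q2_K", "mistral:7b-instruct-q4_K_M"],
    score = [27.2, 34.8])

## @rsubset evaluates the condition row by row; occursin("qwen", :model)
## is true when the model name contains "qwen", so `!` drops those rows
kept = @rsubset df !occursin("qwen", :model)
```

After this, `kept` contains only the mistral row.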

# ## Comparison by Model
2 changes: 2 additions & 0 deletions examples/summarize_results_local.jl
@@ -73,6 +73,8 @@ PROMPTS = [
df = @chain begin
    load_evals(DIR_RESULTS; max_history = 5)
    @rsubset !any(startswith.(:model, PAID_MODELS_DEFAULT)) && :prompt_label in PROMPTS
+    ## remove qwen models as they are not correct!
+    @rsubset !occursin("qwen", :model)
end;
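
The paid-model exclusion in the chain above relies on broadcasting. A hedged sketch of that pattern in isolation — the prefix list here is assumed for illustration; the real `PAID_MODELS_DEFAULT` comes from the package:

```julia
using DataFramesMeta

## Hypothetical prefixes of paid-API models (illustrative values only)
paid_prefixes = ["gpt-3.5", "gpt-4"]

df = DataFrame(model = ["gpt-4-1106-preview", "mistral:7b-instruct-q4_K_M"])

## startswith.(:model, paid_prefixes) compares the row's model name against
## every prefix at once; !any(...) keeps rows that match none of them
local_only = @rsubset df !any(startswith.(:model, paid_prefixes))
```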

# ## Model Comparison
6 changes: 5 additions & 1 deletion examples/summarize_results_prompts.jl
@@ -34,7 +34,11 @@ PROMPTS = [

# ## Load Results
# Use only the 5 most recent evaluations available for each definition/model/prompt
-df = load_evals(DIR_RESULTS; max_history = 5);
+df = @chain begin
+    load_evals(DIR_RESULTS; max_history = 5)
+    ## remove qwen models as they are not correct!
+    @rsubset !occursin("qwen", :model)
+end

# ## Overview of Prompt Templates
# We've added an "AsIs" prompt template, which is just the raw task definition (nothing added).
2 changes: 2 additions & 0 deletions examples/summarize_results_test_cases.jl
@@ -50,6 +50,8 @@ MD("There are currently $(length(fn_definitions)) test cases.") #hide

## Pre-aggregate winning models
top_model = @chain df begin
+    ## remove qwen models as they are not correct!
+    @rsubset !occursin("qwen", :model)
    @rsubset !endswith(:model, "--optim")
    @by [:model, :name] :score=mean(:score)
    @rtransform :is_paid = :model in PAID_MODELS_DEFAULT
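
To make the aggregation pattern concrete, a small sketch of the `@by`/`@rtransform` steps on toy data — the model names, scores, and paid-model list below are invented:

```julia
using DataFramesMeta, Statistics

df = DataFrame(model = ["m1", "m1", "m2"], name = ["case1", "case1", "case1"],
    score = [50.0, 60.0, 80.0])

top_model = @chain df begin
    ## mean score per model/test-case pair
    @by [:model, :name] :score = mean(:score)
    ## flag paid models (list is hypothetical here)
    @rtransform :is_paid = :model in ["gpt-4"]
end
```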
