Skip to content

Commit

Permalink
URIAL bench for olmo 1.7 and Llama-3
Browse files Browse the repository at this point in the history
  • Loading branch information
yuchenlin committed Apr 19, 2024
1 parent 6a7c794 commit e5e4396
Show file tree
Hide file tree
Showing 8 changed files with 183 additions and 4 deletions.
9 changes: 6 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,13 @@ Apart from that, URIAL can also be used to study the science of LLMs, helping to

## Installation

```bash
conda create -n urial python=3.10
conda activate urial
pip install vllm
# conda create -p /net/nfs/mosaic/yuchenl/envs/urial python=3.10
# conda activate /net/nfs/mosaic/yuchenl/envs/urial
pip install -r requirements.new.txt
```

## URIAL Inference
Expand Down
2 changes: 2 additions & 0 deletions requirements.new.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
python-dateutil
openai==0.28.1
14 changes: 13 additions & 1 deletion run_scripts/mt-bench/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,11 @@ bash run_scripts/mt-bench/gemma-7b-urial.sh $version 0.5 1
bash run_scripts/mt-bench/gemma-2b-urial.sh $version 0.5 1
bash run_scripts/mt-bench/mistral-7b-v2-urial.sh $version 0 1.15
bash run_scripts/mt-bench/dbrx-urial.sh $version 0.5 1

# bash run_scripts/mt-bench/olmo-7b-1.7-urial.sh $version 0 1.15 # vllm version not working now...
bash run_scripts/mt-bench/olmo-1.7-hf-urial.sh $version 0 1.15 # hf version
bash run_scripts/mt-bench/llama3-8b-urial.sh $version 0 1.15
bash run_scripts/mt-bench/llama3-70b-urial.sh $version 0 1.15
```

<details>
Expand Down Expand Up @@ -76,6 +81,9 @@ python run_scripts/mt-bench/formatting_results.py amber ${suffix}
python run_scripts/mt-bench/formatting_results.py Mistral-7b-v0.2 ${suffix}
python run_scripts/mt-bench/formatting_results.py dbrx ${suffix}

python run_scripts/mt-bench/formatting_results.py Llama-3-8B ${suffix}
python run_scripts/mt-bench/formatting_results.py Llama-3-70B ${suffix}
python run_scripts/mt-bench/formatting_results.py olmo-7b-v1.7-hf ${suffix}

# python run_scripts/mt-bench/formatting_results.py olmo ${suffix}
# python run_scripts/mt-bench/formatting_results.py phi-2 ${suffix}
Expand All @@ -90,7 +98,7 @@ suffix="0210v1"
# git clone our modified version of FastChat from [url]
cd /net/nfs/mosaic/yuchenl/FastChat/fastchat/llm_judge/
ls -lht /net/nfs/mosaic/yuchenl/FastChat/fastchat/llm_judge//data/mt_bench/model_answer/ # make sure the model answer is there
# conda activate mb
# conda activate /net/nfs/mosaic/yuchenl/envs/mb/
# python --> /home/yuchenl/.conda/envs/mb/bin/python
python gen_judgment.py --parallel 8 --model-list Llama-2-70b-hf-URIAL-${suffix}
python gen_judgment.py --parallel 8 --model-list Mixtral-8x7B-v0.1-URIAL-${suffix}
Expand All @@ -115,6 +123,10 @@ python gen_judgment.py --parallel 8 --model-list amber-URIAL-${suffix}
python gen_judgment.py --parallel 8 --model-list Mistral-7b-v0.2-URIAL-${suffix}
python gen_judgment.py --parallel 8 --model-list dbrx-URIAL-${suffix}

python gen_judgment.py --parallel 8 --model-list Llama-3-8B-URIAL-${suffix}
python gen_judgment.py --parallel 8 --model-list Llama-3-70B-URIAL-${suffix}
python gen_judgment.py --parallel 8 --model-list olmo-7b-v1.7-hf-URIAL-${suffix}



```
Expand Down
34 changes: 34 additions & 0 deletions run_scripts/mt-bench/llama3-70b-urial.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# MT-Bench URIAL inference for Meta-Llama-3-70B (base model) via vLLM.
# Runs turn 1 over all MT-Bench questions, then turn 2 conditioned on the
# turn-1 answers. Results land in result_dirs/mt-bench/urial_bench/.
#
# Usage: bash llama3-70b-urial.sh <urial_version> [temp=0] [rep_penalty=1] [gpus=0,1,2,3] [tp_size=4]
version=$1
temp=${2:-0}
rp=${3:-1}
output_dir="result_dirs/mt-bench/urial_bench/"
mkdir -p "$output_dir"
# GPU list and tensor-parallel size are now overridable via $4/$5; defaults
# keep the previous hard-coded behavior and mirror llama3-8b-urial.sh.
gpu=${4:-"0,1,2,3"}
n=${5:-4}
model_id="meta-llama/Meta-Llama-3-70B"
pretty_name="Llama-3-70B"

# Turn 1: answer the first question of each MT-Bench conversation.
CUDA_VISIBLE_DEVICES=$gpu python src/unified_infer.py \
    --urial "$version" \
    --download_dir /net/nfs/s2-research/llama2/ \
    --model_name "$model_id" \
    --tensor_parallel_size "$n" \
    --dtype bfloat16 \
    --data_name mt-bench \
    --mt_turn 1 \
    --top_p 1 --temperature "$temp" --repetition_penalty "$rp" --batch_size 1 --max_tokens 2048 \
    --filepath "$output_dir/${pretty_name}.turn1.json" \
    --overwrite

# Turn 2: continue each conversation using the turn-1 outputs written above.
CUDA_VISIBLE_DEVICES=$gpu python src/unified_infer.py \
    --urial "$version" \
    --download_dir /net/nfs/s2-research/llama2/ \
    --model_name "$model_id" \
    --tensor_parallel_size "$n" \
    --dtype bfloat16 \
    --data_name mt-bench \
    --mt_turn 2 \
    --mt_turn1_result "$output_dir/${pretty_name}.turn1.json" \
    --top_p 1 --temperature "$temp" --repetition_penalty "$rp" --batch_size 1 --max_tokens 2048 \
    --filepath "$output_dir/${pretty_name}.turn2.json" \
    --overwrite
34 changes: 34 additions & 0 deletions run_scripts/mt-bench/llama3-8b-urial.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# MT-Bench URIAL inference for Meta-Llama-3-8B (base model) via vLLM.
# Runs turn 1 over all MT-Bench questions, then turn 2 conditioned on the
# turn-1 answers. Results land in result_dirs/mt-bench/urial_bench/.
#
# Usage: bash llama3-8b-urial.sh <urial_version> [temp=0] [rep_penalty=1] [gpus=0,1,2,3] [tp_size=4]
version=$1
temp=${2:-0}
rp=${3:-1}
output_dir="result_dirs/mt-bench/urial_bench/"
mkdir -p "$output_dir"
gpu=${4:-"0,1,2,3"}
tsp=${5:-4}
model_id="meta-llama/Meta-Llama-3-8B"
pretty_name="Llama-3-8B"

# Turn 1: answer the first question of each MT-Bench conversation.
CUDA_VISIBLE_DEVICES=$gpu python src/unified_infer.py \
    --urial "$version" \
    --download_dir /net/nfs/s2-research/llama2/ \
    --model_name "$model_id" \
    --tensor_parallel_size "$tsp" \
    --dtype bfloat16 \
    --data_name mt-bench \
    --mt_turn 1 \
    --top_p 1 --temperature "$temp" --repetition_penalty "$rp" --batch_size 4 --max_tokens 2048 \
    --filepath "$output_dir/${pretty_name}.turn1.json" \
    --overwrite

# Turn 2: continue each conversation using the turn-1 outputs written above.
# (Larger batch size here matches the original script: turn-2 prompts fit.)
CUDA_VISIBLE_DEVICES=$gpu python src/unified_infer.py \
    --urial "$version" \
    --download_dir /net/nfs/s2-research/llama2/ \
    --model_name "$model_id" \
    --tensor_parallel_size "$tsp" \
    --dtype bfloat16 \
    --data_name mt-bench \
    --mt_turn 2 \
    --mt_turn1_result "$output_dir/${pretty_name}.turn1.json" \
    --top_p 1 --temperature "$temp" --repetition_penalty "$rp" --batch_size 8 --max_tokens 2048 \
    --filepath "$output_dir/${pretty_name}.turn2.json" \
    --overwrite
55 changes: 55 additions & 0 deletions run_scripts/mt-bench/olmo-1.7-hf-urial.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
# MT-Bench URIAL inference for OLMo-1.7-7B via the HuggingFace engine
# (the vLLM path in olmo-7b-1.7-urial.sh is currently not working).
# The 80 MT-Bench prompts are split into n_shards contiguous index ranges,
# one background worker per GPU; per-shard outputs are merged after a barrier.
#
# Usage: bash olmo-1.7-hf-urial.sh <urial_version> [temp=0] [rep_penalty=1] [first_gpu=0]
version=$1
temp=${2:-0}
rp=${3:-1}
output_dir="result_dirs/mt-bench/urial_bench/"
mkdir -p "$output_dir"
# BUGFIX: $4 was previously stored in an unused 'gpus' variable while the
# shard loops always started at GPU 0; it now selects the first GPU used.
start_gpu=${4:-0}
n_shards=4     # number of parallel workers (one GPU each)
shard_size=20  # prompts per worker; 4 * 20 covers all 80 MT-Bench questions

pretty_name="olmo-7b-v1.7-hf"
model_name="allenai/OLMo-1.7-7B-hf"

# Run one MT-Bench turn, sharded across GPUs, then merge the shard files.
# $1   - turn number (1 or 2)
# $2.. - extra flags forwarded to unified_infer.py (e.g. --mt_turn1_result ...)
run_sharded_turn() {
  local turn=$1
  shift
  local start end gpu
  for ((start = 0, end = shard_size, gpu = start_gpu; gpu < n_shards + start_gpu; start += shard_size, end += shard_size, gpu++)); do
    CUDA_VISIBLE_DEVICES=$gpu python src/unified_infer.py \
        --start_index "$start" --end_index "$end" \
        --engine hf \
        --urial "$version" \
        --download_dir /net/nfs/s2-research/llama2/ \
        --model_name "$model_name" \
        --dtype bfloat16 \
        --data_name mt-bench \
        --mt_turn "$turn" \
        --no_repeat_ngram_size 3 \
        "$@" \
        --top_p 1 --temperature "$temp" --repetition_penalty "$rp" --batch_size 1 --max_tokens 2048 \
        --filepath "$output_dir/${pretty_name}.turn${turn}.${start}-${end}.json" \
        --overwrite &
  done
  wait  # barrier: every shard must finish before merging
  python evaluate/merge_results.py "$output_dir/" "${pretty_name}.turn${turn}"
}

run_sharded_turn 1
# Turn 2 reads the merged turn-1 file produced by the call above.
run_sharded_turn 2 --mt_turn1_result "$output_dir/${pretty_name}.turn1.json"
36 changes: 36 additions & 0 deletions run_scripts/mt-bench/olmo-7b-1.7-urial.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# MT-Bench URIAL inference for OLMo-1.7-7B via vLLM.
# NOTE(review): the repo README marks this vLLM path as "not working now";
# prefer olmo-1.7-hf-urial.sh (HuggingFace engine) until that is resolved.
#
# Usage: bash olmo-7b-1.7-urial.sh <urial_version> [temp=0] [rep_penalty=1] [gpus=0] [tp_size=1]
pretty_name="olmo-7b-v1.7-vllm"
model_name="allenai/OLMo-1.7-7B-hf"
version=$1
temp=${2:-0}
rp=${3:-1}
CACHE_DIR="/net/nfs/climate/tmp_cache/"
output_dir="result_dirs/mt-bench/urial_bench/"
mkdir -p "$output_dir"
gpu=${4:-"0"}
tsp=${5:-1}

# Turn 1: answer the first question of each MT-Bench conversation.
CUDA_VISIBLE_DEVICES=$gpu python src/unified_infer.py \
    --urial "$version" \
    --download_dir "$CACHE_DIR" \
    --model_name "$model_name" \
    --tensor_parallel_size "$tsp" \
    --dtype bfloat16 \
    --data_name mt-bench \
    --mt_turn 1 \
    --top_p 1 --temperature "$temp" --repetition_penalty "$rp" --batch_size 4 --max_tokens 2048 \
    --filepath "$output_dir/${pretty_name}.turn1.json" \
    --overwrite

# Turn 2: continue each conversation using the turn-1 outputs written above.
CUDA_VISIBLE_DEVICES=$gpu python src/unified_infer.py \
    --urial "$version" \
    --download_dir "$CACHE_DIR" \
    --model_name "$model_name" \
    --tensor_parallel_size "$tsp" \
    --dtype bfloat16 \
    --data_name mt-bench \
    --mt_turn 2 \
    --mt_turn1_result "$output_dir/${pretty_name}.turn1.json" \
    --top_p 1 --temperature "$temp" --repetition_penalty "$rp" --batch_size 8 --max_tokens 2048 \
    --filepath "$output_dir/${pretty_name}.turn2.json" \
    --overwrite
3 changes: 3 additions & 0 deletions run_scripts/mt-bench/urial_bench.jsonl
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,17 @@
{"model": "Mixtral-8x7B-v0.1", "Turn 1": 7.69375, "Turn 2": 6.1875, "Overall": 6.940625, "coding": 5.3, "extraction": 7.05, "humanities": 9.2, "math": 4.85, "reasoning": 5.3, "roleplay": 7.4, "stem": 8.225, "writing": 8.2}
{"model": "Mistral-7b-v0.1", "Turn 1": 7.4875, "Turn 2": 5.8625, "Overall": 6.675, "coding": 4.6, "extraction": 7.75, "humanities": 9.075, "math": 3.4, "reasoning": 4.9, "roleplay": 7.65, "stem": 8.275, "writing": 7.75}
{"model": "Yi-34B", "Turn 1": 7.19375, "Turn 2": 6.15625, "Overall": 6.675, "coding": 3.85, "extraction": 6.8, "humanities": 8.475, "math": 4.8, "reasoning": 6.0, "roleplay": 7.75, "stem": 7.825, "writing": 7.9}
{"model": "Llama-3-70B", "Turn 1": 7.7125, "Turn 2": 5.0875, "Overall": 6.4, "coding": 4.35, "extraction": 6.2, "humanities": 8.0, "math": 3.8, "reasoning": 4.95, "roleplay": 7.25, "stem": 8.55, "writing": 8.1}
{"model": "Mistral-7b-v0.2", "Turn 1": 6.9875, "Turn 2": 5.55, "Overall": 6.26875, "coding": 3.8, "extraction": 7.45, "humanities": 8.95, "math": 3.35, "reasoning": 4.5, "roleplay": 6.7, "stem": 7.425, "writing": 7.975}
{"model": "phi-2-vllm", "Turn 1": 7.16875, "Turn 2": 4.936708860759493, "Overall": 6.059748427672956, "coding": 4.55, "extraction": 5.3, "humanities": 8.65, "math": 3.35, "reasoning": 5.5, "roleplay": 6.625, "stem": 7.105263157894737, "writing": 7.45}
{"model": "gemma-7b", "Turn 1": 6.96875, "Turn 2": 5.0375, "Overall": 6.003125, "coding": 3.95, "extraction": 6.25, "humanities": 8.825, "math": 4.35, "reasoning": 4.5, "roleplay": 6.25, "stem": 7.25, "writing": 6.65}
{"model": "phi-2", "Turn 1": 7.0375, "Turn 2": 4.6625, "Overall": 5.85, "coding": 4.25, "extraction": 4.45, "humanities": 8.85, "math": 3.8, "reasoning": 4.55, "roleplay": 7.2, "stem": 7.0, "writing": 6.7}
{"model": "Llama-3-8B", "Turn 1": 6.84375, "Turn 2": 4.65, "Overall": 5.746875, "coding": 4.15, "extraction": 5.25, "humanities": 8.9, "math": 2.6, "reasoning": 3.5, "roleplay": 7.3, "stem": 8.15, "writing": 6.125}
{"model": "Llama-2-13b-hf", "Turn 1": 6.26875, "Turn 2": 4.4125, "Overall": 5.340625, "coding": 2.8, "extraction": 4.7, "humanities": 8.3, "math": 2.85, "reasoning": 2.9, "roleplay": 6.625, "stem": 7.025, "writing": 7.525}
{"model": "Yi-6B", "Turn 1": 5.95625, "Turn 2": 3.9875, "Overall": 4.971875, "coding": 2.3, "extraction": 2.95, "humanities": 8.775, "math": 2.5, "reasoning": 3.5, "roleplay": 6.95, "stem": 7.7, "writing": 5.1}
{"model": "Llama-2-7b-hf", "Turn 1": 5.75, "Turn 2": 3.9125, "Overall": 4.83125, "coding": 1.65, "extraction": 3.4, "humanities": 8.075, "math": 1.6, "reasoning": 3.45, "roleplay": 7.475, "stem": 6.8, "writing": 6.2}
{"model": "gemma-2b", "Turn 1": 5.08125, "Turn 2": 2.8625, "Overall": 3.971875, "coding": 1.8, "extraction": 3.1, "humanities": 5.65, "math": 3.3, "reasoning": 2.55, "roleplay": 5.7, "stem": 5.725, "writing": 3.95}
{"model": "olmo-7b-v1.7-hf", "Turn 1": 4.7375, "Turn 2": 2.875, "Overall": 3.80625, "coding": 1.85, "extraction": 2.8, "humanities": 6.225, "math": 1.4, "reasoning": 3.4, "roleplay": 5.55, "stem": 5.275, "writing": 3.95}
{"model": "olmo", "Turn 1": 3.95, "Turn 2": 2.8625, "Overall": 3.40625, "coding": 1.65, "extraction": 2.45, "humanities": 4.9, "math": 1.25, "reasoning": 2.45, "roleplay": 5.3, "stem": 5.3, "writing": 3.95}
{"model": "olmo-7b-vllm", "Turn 1": 4.61875, "Turn 2": 2.1375, "Overall": 3.378125, "coding": 1.25, "extraction": 2.75, "humanities": 5.4, "math": 1.45, "reasoning": 2.75, "roleplay": 4.55, "stem": 5.475, "writing": 3.4}
{"model": "falcon-7b", "Turn 1": 4.09375, "Turn 2": 2.1, "Overall": 3.096875, "coding": 1.55, "extraction": 2.75, "humanities": 4.4, "math": 1.6, "reasoning": 2.8, "roleplay": 4.55, "stem": 4.425, "writing": 2.7}
Expand Down

0 comments on commit e5e4396

Please sign in to comment.