fix: enable trust-remote-code in api server & benchmark. (vllm-projec…
gesanqiu authored Jul 20, 2023
1 parent cf21a9b commit 8c4b259
Showing 4 changed files with 14 additions and 6 deletions.

benchmarks/benchmark_latency.py (3 additions, 0 deletions)

@@ -21,6 +21,7 @@ def main(args: argparse.Namespace):
         tensor_parallel_size=args.tensor_parallel_size,
         max_num_seqs=args.batch_size,
         max_num_batched_tokens=args.batch_size * args.input_len,
+        trust_remote_code=args.trust_remote_code,
     )
 
     sampling_params = SamplingParams(
@@ -74,5 +75,7 @@ def run_to_completion(profile: bool = False):
     parser.add_argument('--use-beam-search', action='store_true')
     parser.add_argument('--num-iters', type=int, default=3,
                         help='Number of iterations to run.')
+    parser.add_argument('--trust-remote-code', action='store_true',
+                        help='trust remote code from huggingface')
     args = parser.parse_args()
     main(args)
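
The flag matters because transformers refuses to load models whose architecture or tokenizer code ships inside the model repo unless the caller opts in. A minimal sketch of what the benchmark can now load (the model ID is illustrative; any repo with custom modeling code behaves the same way):

    from transformers import AutoConfig, AutoTokenizer

    # Repos that define their own architecture (e.g. MPT-style models)
    # prompt or raise unless the caller passes trust_remote_code=True.
    config = AutoConfig.from_pretrained("mosaicml/mpt-7b", trust_remote_code=True)
    tokenizer = AutoTokenizer.from_pretrained("mosaicml/mpt-7b", trust_remote_code=True)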

benchmarks/benchmark_serving.py (3 additions, 1 deletion)

@@ -177,7 +177,7 @@ def main(args: argparse.Namespace):
     np.random.seed(args.seed)
 
     api_url = f"http://{args.host}:{args.port}/generate"
-    tokenizer = get_tokenizer(args.tokenizer)
+    tokenizer = get_tokenizer(args.tokenizer, trust_remote_code=args.trust_remote_code)
     input_requests = sample_requests(args.dataset, args.num_prompts, tokenizer)
 
     benchmark_start_time = time.time()
@@ -227,5 +227,7 @@ def main(args: argparse.Namespace):
"Otherwise, we use Poisson process to synthesize "
"the request arrival times.")
parser.add_argument("--seed", type=int, default=0)
parser.add_argument('--trust-remote-code', action='store_true',
help='trust remote code from huggingface')
args = parser.parse_args()
main(args)
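
The flag has to reach get_tokenizer because that helper ultimately hands it to transformers. A hedged sketch of the forwarding, assuming vllm's real helper behaves like a thin wrapper (the actual implementation also handles tokenizer_mode and other options, omitted here):

    from transformers import AutoTokenizer, PreTrainedTokenizerBase

    def get_tokenizer(tokenizer_name: str, **kwargs) -> PreTrainedTokenizerBase:
        # Sketch: keyword arguments such as trust_remote_code are passed
        # straight through to transformers, which is why the CLI flag must
        # be threaded all the way to this call site.
        return AutoTokenizer.from_pretrained(tokenizer_name, **kwargs)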

benchmarks/benchmark_throughput.py (6 additions, 4 deletions)

@@ -74,7 +74,7 @@ def run_vllm(
         tokenizer=tokenizer,
         tensor_parallel_size=tensor_parallel_size,
         seed=seed,
-        trust_remote_code=trust_remote_code
+        trust_remote_code=trust_remote_code,
     )
 
     # Add the requests to the engine.
@@ -111,7 +111,8 @@ def run_hf(
     trust_remote_code: bool,
 ) -> float:
     assert not use_beam_search
-    llm = AutoModelForCausalLM.from_pretrained(model, torch_dtype=torch.float16, trust_remote_code=trust_remote_code)
+    llm = AutoModelForCausalLM.from_pretrained(model,
+        torch_dtype=torch.float16, trust_remote_code=trust_remote_code)
     if llm.config.model_type == "llama":
         # To enable padding in the HF backend.
         tokenizer.pad_token = tokenizer.eos_token
@@ -173,8 +174,9 @@ def main(args: argparse.Namespace):
                                 args.seed, args.n, args.use_beam_search, args.trust_remote_code)
     elif args.backend == "hf":
         assert args.tensor_parallel_size == 1
-        elapsed_time = run_hf(requests, args.model, tokenizer, args.n,
-                              args.use_beam_search, args.hf_max_batch_size)
+        elapsed_time = run_hf(
+            requests, args.model, tokenizer, args.n, args.use_beam_search,
+            args.hf_max_batch_size, args.trust_remote_code)
     else:
         raise ValueError(f"Unknown backend: {args.backend}")
     total_num_tokens = sum(
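
The call-site change is more than a reflow: run_hf already declares trust_remote_code as its final parameter (visible in the @@ -111 hunk above), so the old call, which stopped at args.hf_max_batch_size, would have raised a TypeError. The signature below is a sketch reconstructed from the context lines; the names and types of the earlier parameters are inferred from the call site:

    from typing import List, Tuple
    from transformers import PreTrainedTokenizerBase

    def run_hf(
        requests: List[Tuple[str, int, int]],  # inferred: (prompt, prompt_len, output_len)
        model: str,
        tokenizer: PreTrainedTokenizerBase,
        n: int,
        use_beam_search: bool,
        max_batch_size: int,
        trust_remote_code: bool,  # confirmed by the hunk above
    ) -> float:
        ...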

vllm/entrypoints/openai/api_server.py (2 additions, 1 deletion)

@@ -585,7 +585,8 @@ async def fake_stream_generator() -> AsyncGenerator[str, None]:

     # A separate tokenizer to map token IDs to strings.
     tokenizer = get_tokenizer(engine_args.tokenizer,
-                              tokenizer_mode=engine_args.tokenizer_mode)
+                              tokenizer_mode=engine_args.tokenizer_mode,
+                              trust_remote_code=engine_args.trust_remote_code)
 
     uvicorn.run(app,
                 host=args.host,
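
As the diff's comment says, the server keeps this second tokenizer only to map generated token IDs back to strings, so it must be able to load the same (possibly custom-code) tokenizer class as the engine, with a matching trust_remote_code setting. A minimal sketch of the ID-to-string direction (gpt2 stands in for engine_args.tokenizer and needs no remote code):

    from transformers import AutoTokenizer

    # A custom-code model would additionally need trust_remote_code=True,
    # matching the engine's own setting.
    tokenizer = AutoTokenizer.from_pretrained("gpt2")
    ids = tokenizer.encode("Hello, world!")
    print(tokenizer.convert_ids_to_tokens(ids))  # IDs back to token strings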
