Fix incorrect token counts in the streaming client

Signed-off-by: Gangmuk <[email protected]>
vllm-project · Feb 25, 2025 · 3abf67a · 3abf67a
1 parent 1b8805f
commit 3abf67a
Showing 1 changed file with 9 additions and 4 deletions.
diff --git a/benchmarks/client/client.py b/benchmarks/client/client.py
@@ -47,15 +47,18 @@ async def send_request_streaming(client: openai.AsyncOpenAI,
                             first_response_time = asyncio.get_event_loop().time()
                         output_text = chunk.choices[0].delta.content
                         text_chunks.append(output_text)
+                if hasattr(chunk, 'usage') and chunk.usage is not None:
+                    # For OpenAI, we expect to get complete usage stats, not partial ones to accumulate
+                    # So we can safely overwrite previous values if they exist
+                    if chunk.usage.prompt_tokens is not None:
                         prompt_tokens = chunk.usage.prompt_tokens
+                    if chunk.usage.completion_tokens is not None:
                         output_tokens = chunk.usage.completion_tokens
+                    if chunk.usage.total_tokens is not None:
                         total_tokens = chunk.usage.total_tokens
         except Exception as stream_error:
             # Handle errors during streaming
             logging.error(f"Request {request_id}: Stream interrupted: {type(stream_error).__name__}: {str(stream_error)}")
-            # Still try to use what we've received so far
-            if not text_chunks:
-                raise  # Re-raise if we got nothing at all
 
         response_text = "".join(text_chunks)
         response_time = asyncio.get_event_loop().time()
@@ -216,7 +219,9 @@ def main(args):
             base_url=args.endpoint + "/v1",
         )
         if args.routing_strategy is not None:
-            client.default_headers["routing-strategy"] = args.routing_strategy
+            client = client.with_options(
+                default_headers={"routing-strategy": args.routing_strategy}
+            )
         if not args.streaming:
             logging.info("Using batch client")
             start_time = time.time()