From 771b9b0ac044edc3fac7dd1171da5e93f2f64206 Mon Sep 17 00:00:00 2001
From: Eric Curtin <eric.curtin@docker.com>
Date: Fri, 27 Feb 2026 16:07:39 +0000
Subject: [PATCH] set ContentLength when forwarding requests
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

HTTP/2 clients (e.g. Java HttpClient with HTTP_2 version) often omit the
Content-Length header since HTTP/2 uses DATA frames for body framing. The
incoming request then has ContentLength == -1 (unknown length), so when
DMR's reverse proxy forwards it to the backend via HTTP/1.1, Go's net/http
falls back to Transfer-Encoding: chunked. vLLM's Python/uvicorn server
fails to parse the chunked body, resulting in an empty body and a 422
Unprocessable Entity response.

Fix by explicitly setting ContentLength = int64(len(body)) on the upstream
request after replacing the body with the already-buffered bytes. This
ensures a Content-Length header is always sent, matching what the Ollama
and Anthropic handlers already do. llama.cpp was unaffected because its
C/C++ HTTP server handles chunked encoding gracefully.

Signed-off-by: Eric Curtin <eric.curtin@docker.com>
---
 pkg/inference/scheduling/http_handler.go | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/pkg/inference/scheduling/http_handler.go b/pkg/inference/scheduling/http_handler.go
index a9f3077b..d018ad0b 100644
--- a/pkg/inference/scheduling/http_handler.go
+++ b/pkg/inference/scheduling/http_handler.go
@@ -271,8 +271,14 @@ func (h *HTTPHandler) handleOpenAIInference(w http.ResponseWriter, r *http.Reque
 	}()
 
 	// Create a request with the body replaced for forwarding upstream.
+	// Set ContentLength explicitly so the backend always receives a Content-Length
+	// header. Without this, HTTP/2 requests (where clients may omit Content-Length)
+	// are forwarded with Transfer-Encoding: chunked, which some backends (e.g.
+	// vLLM's Python/uvicorn server) fail to parse, resulting in an empty body and
+	// a 422 response.
 	upstreamRequest := r.Clone(r.Context())
 	upstreamRequest.Body = io.NopCloser(bytes.NewReader(body))
+	upstreamRequest.ContentLength = int64(len(body))
 
 	// Perform the request.
 	runner.ServeHTTP(w, upstreamRequest)