001/* 002 * Copyright 2022-2025 Revetware LLC. 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); 005 * you may not use this file except in compliance with the License. 006 * You may obtain a copy of the License at 007 * 008 * http://www.apache.org/licenses/LICENSE-2.0 009 * 010 * Unless required by applicable law or agreed to in writing, software 011 * distributed under the License is distributed on an "AS IS" BASIS, 012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 013 * See the License for the specific language governing permissions and 014 * limitations under the License. 015 */ 016 017package com.soklet; 018 019import com.soklet.internal.spring.LinkedCaseInsensitiveMap; 020 021import javax.annotation.Nonnull; 022import javax.annotation.Nullable; 023import javax.annotation.concurrent.ThreadSafe; 024import java.io.ByteArrayOutputStream; 025import java.lang.Thread.UncaughtExceptionHandler; 026import java.lang.invoke.MethodHandle; 027import java.lang.invoke.MethodHandles; 028import java.lang.invoke.MethodHandles.Lookup; 029import java.lang.invoke.MethodType; 030import java.net.URI; 031import java.net.URISyntaxException; 032import java.net.URLEncoder; 033import java.nio.charset.Charset; 034import java.nio.charset.IllegalCharsetNameException; 035import java.nio.charset.StandardCharsets; 036import java.nio.charset.UnsupportedCharsetException; 037import java.util.ArrayDeque; 038import java.util.ArrayList; 039import java.util.Arrays; 040import java.util.Collections; 041import java.util.Deque; 042import java.util.LinkedHashMap; 043import java.util.LinkedHashSet; 044import java.util.List; 045import java.util.Locale; 046import java.util.Locale.LanguageRange; 047import java.util.Map; 048import java.util.Map.Entry; 049import java.util.Optional; 050import java.util.Set; 051import java.util.concurrent.ExecutorService; 052import java.util.concurrent.Executors; 053import java.util.concurrent.ThreadFactory; 054import java.util.regex.Matcher; 055import java.util.regex.Pattern; 056import java.util.stream.Collectors; 057 058import static java.lang.String.format; 059import static java.util.Objects.requireNonNull; 060 061/** 062 * A non-instantiable collection of utility methods. 063 * 064 * @author <a href="https://www.revetkn.com">Mark Allen</a> 065 */ 066@ThreadSafe 067public final class Utilities { 068 @Nonnull 069 private static final boolean VIRTUAL_THREADS_AVAILABLE; 070 @Nonnull 071 private static final byte[] EMPTY_BYTE_ARRAY; 072 @Nonnull 073 private static final Map<String, Locale> LOCALES_BY_LANGUAGE_RANGE_RANGE; 074 @Nonnull 075 private static final Pattern HEAD_WHITESPACE_PATTERN; 076 @Nonnull 077 private static final Pattern TAIL_WHITESPACE_PATTERN; 078 @Nonnull 079 private static final Pattern HEADER_PERCENT_ENCODING_PATTERN; 080 081 static { 082 EMPTY_BYTE_ARRAY = new byte[0]; 083 084 Locale[] locales = Locale.getAvailableLocales(); 085 Map<String, Locale> localesByLanguageRangeRange = new LinkedHashMap<>(locales.length); 086 087 for (Locale locale : locales) { 088 LanguageRange languageRange = new LanguageRange(locale.toLanguageTag()); 089 localesByLanguageRangeRange.put(languageRange.getRange(), locale); 090 } 091 092 LOCALES_BY_LANGUAGE_RANGE_RANGE = Collections.unmodifiableMap(localesByLanguageRangeRange); 093 094 boolean virtualThreadsAvailable = false; 095 096 try { 097 // Detect if Virtual Threads are usable by feature testing via reflection. 098 // Hat tip to https://github.com/javalin/javalin for this technique 099 Class.forName("java.lang.Thread$Builder$OfVirtual"); 100 virtualThreadsAvailable = true; 101 } catch (Exception ignored) { 102 // We don't care why this failed, but if we're here we know JVM does not support virtual threads 103 } 104 105 VIRTUAL_THREADS_AVAILABLE = virtualThreadsAvailable; 106 107 // See https://www.regular-expressions.info/unicode.html 108 // \p{Z} or \p{Separator}: any kind of whitespace or invisible separator. 109 // 110 // First pattern matches all whitespace at the head of a string, second matches the same for tail. 111 // Useful for a "stronger" trim() function, which is almost always what we want in a web context 112 // with user-supplied input. 113 HEAD_WHITESPACE_PATTERN = Pattern.compile("^(\\p{Z})+"); 114 TAIL_WHITESPACE_PATTERN = Pattern.compile("(\\p{Z})+$"); 115 116 HEADER_PERCENT_ENCODING_PATTERN = Pattern.compile("%([0-9A-Fa-f]{2})"); 117 } 118 119 private Utilities() { 120 // Non-instantiable 121 } 122 123 /** 124 * Does the platform runtime support virtual threads (either Java 19 and 20 w/preview enabled or Java 21+)? 125 * 126 * @return {@code true} if the runtime supports virtual threads, {@code false} otherwise 127 */ 128 @Nonnull 129 static Boolean virtualThreadsAvailable() { 130 return VIRTUAL_THREADS_AVAILABLE; 131 } 132 133 /** 134 * Provides a virtual-thread-per-task executor service if supported by the runtime. 135 * <p> 136 * In order to support Soklet users who are not yet ready to enable virtual threads (those <strong>not</strong> running either Java 19 and 20 w/preview enabled or Java 21+), 137 * we compile Soklet with a source level < 19 and avoid any hard references to virtual threads by dynamically creating our executor service via {@link MethodHandle} references. 138 * <p> 139 * <strong>You should not call this method if {@link Utilities#virtualThreadsAvailable()} is {@code false}.</strong> 140 * <pre>{@code // This method is effectively equivalent to this code 141 * return Executors.newThreadPerTaskExecutor( 142 * Thread.ofVirtual() 143 * .name(threadNamePrefix) 144 * .uncaughtExceptionHandler(uncaughtExceptionHandler) 145 * .factory() 146 * );}</pre> 147 * 148 * @param threadNamePrefix thread name prefix for the virtual thread factory builder 149 * @param uncaughtExceptionHandler uncaught exception handler for the virtual thread factory builder 150 * @return a virtual-thread-per-task executor service 151 * @throws IllegalStateException if the runtime environment does not support virtual threads 152 */ 153 @Nonnull 154 static ExecutorService createVirtualThreadsNewThreadPerTaskExecutor(@Nonnull String threadNamePrefix, 155 @Nonnull UncaughtExceptionHandler uncaughtExceptionHandler) { 156 requireNonNull(threadNamePrefix); 157 requireNonNull(uncaughtExceptionHandler); 158 159 if (!virtualThreadsAvailable()) 160 throw new IllegalStateException("Virtual threads are not available. Please confirm you are using Java 19-20 with the '--enable-preview' javac parameter specified or Java 21+"); 161 162 // Hat tip to https://github.com/javalin/javalin for this technique 163 Class<?> threadBuilderOfVirtualClass; 164 165 try { 166 threadBuilderOfVirtualClass = Class.forName("java.lang.Thread$Builder$OfVirtual"); 167 } catch (ClassNotFoundException e) { 168 throw new IllegalStateException("Unable to load virtual thread builder class", e); 169 } 170 171 Lookup lookup = MethodHandles.publicLookup(); 172 173 MethodHandle methodHandleThreadOfVirtual; 174 MethodHandle methodHandleThreadBuilderOfVirtualName; 175 MethodHandle methodHandleThreadBuilderOfVirtualUncaughtExceptionHandler; 176 MethodHandle methodHandleThreadBuilderOfVirtualFactory; 177 MethodHandle methodHandleExecutorsNewThreadPerTaskExecutor; 178 179 try { 180 methodHandleThreadOfVirtual = lookup.findStatic(Thread.class, "ofVirtual", MethodType.methodType(threadBuilderOfVirtualClass)); 181 methodHandleThreadBuilderOfVirtualName = lookup.findVirtual(threadBuilderOfVirtualClass, "name", MethodType.methodType(threadBuilderOfVirtualClass, String.class, long.class)); 182 methodHandleThreadBuilderOfVirtualUncaughtExceptionHandler = lookup.findVirtual(threadBuilderOfVirtualClass, "uncaughtExceptionHandler", MethodType.methodType(threadBuilderOfVirtualClass, UncaughtExceptionHandler.class)); 183 methodHandleThreadBuilderOfVirtualFactory = lookup.findVirtual(threadBuilderOfVirtualClass, "factory", MethodType.methodType(ThreadFactory.class)); 184 methodHandleExecutorsNewThreadPerTaskExecutor = lookup.findStatic(Executors.class, "newThreadPerTaskExecutor", MethodType.methodType(ExecutorService.class, ThreadFactory.class)); 185 } catch (NoSuchMethodException | IllegalAccessException e) { 186 throw new IllegalStateException("Unable to load method handle for virtual thread factory", e); 187 } 188 189 try { 190 // Thread.ofVirtual() 191 Object virtualThreadBuilder = methodHandleThreadOfVirtual.invoke(); 192 // .name(threadNamePrefix, start) 193 methodHandleThreadBuilderOfVirtualName.invoke(virtualThreadBuilder, threadNamePrefix, 1); 194 // .uncaughtExceptionHandler(uncaughtExceptionHandler) 195 methodHandleThreadBuilderOfVirtualUncaughtExceptionHandler.invoke(virtualThreadBuilder, uncaughtExceptionHandler); 196 // .factory(); 197 ThreadFactory threadFactory = (ThreadFactory) methodHandleThreadBuilderOfVirtualFactory.invoke(virtualThreadBuilder); 198 199 // return Executors.newThreadPerTaskExecutor(threadFactory); 200 return (ExecutorService) methodHandleExecutorsNewThreadPerTaskExecutor.invoke(threadFactory); 201 } catch (Throwable t) { 202 throw new IllegalStateException("Unable to create virtual thread executor service", t); 203 } 204 } 205 206 /** 207 * Returns a shared zero-length {@code byte[]} instance. 208 * <p> 209 * Useful as a sentinel when you need a non-{@code null} byte array but have no content. 210 * 211 * @return a zero-length byte array (never {@code null}) 212 */ 213 @Nonnull 214 static byte[] emptyByteArray() { 215 return EMPTY_BYTE_ARRAY; 216 } 217 218 /** 219 * Parses a query string such as {@code "a=1&b=2&c=%20"} into a multimap of names to values. 220 * <p> 221 * Decodes percent-escapes using UTF-8, which is usually what you want (see {@link #extractQueryParametersFromQuery(String, QueryFormat, Charset)} if you need to specify a different charset). 222 * <p> 223 * Pairs missing a name are ignored. 224 * <p> 225 * Multiple occurrences of the same name are collected into a {@link Set} in insertion order (duplicates are de-duplicated). 226 * 227 * @param query a raw query string such as {@code "a=1&b=2&c=%20"} 228 * @param queryFormat how to decode: {@code application/x-www-form-urlencoded} or "strict" RFC 3986 229 * @return a map of parameter names to their distinct values, preserving first-seen name order; empty if none 230 */ 231 @Nonnull 232 public static Map<String, Set<String>> extractQueryParametersFromQuery(@Nonnull String query, 233 @Nonnull QueryFormat queryFormat) { 234 requireNonNull(query); 235 requireNonNull(queryFormat); 236 237 return extractQueryParametersFromQuery(query, queryFormat, StandardCharsets.UTF_8); 238 } 239 240 /** 241 * Parses a query string such as {@code "a=1&b=2&c=%20"} into a multimap of names to values. 242 * <p> 243 * Decodes percent-escapes using the specified charset. 244 * <p> 245 * Pairs missing a name are ignored. 246 * <p> 247 * Multiple occurrences of the same name are collected into a {@link Set} in insertion order (duplicates are de-duplicated). 248 * 249 * @param query a raw query string such as {@code "a=1&b=2&c=%20"} 250 * @param queryFormat how to decode: {@code application/x-www-form-urlencoded} or "strict" RFC 3986 251 * @param charset the charset to use when decoding percent-escapes 252 * @return a map of parameter names to their distinct values, preserving first-seen name order; empty if none 253 */ 254 @Nonnull 255 public static Map<String, Set<String>> extractQueryParametersFromQuery(@Nonnull String query, 256 @Nonnull QueryFormat queryFormat, 257 @Nonnull Charset charset) { 258 requireNonNull(query); 259 requireNonNull(queryFormat); 260 requireNonNull(charset); 261 262 // For form parameters, body will look like "One=Two&Three=Four" ...a query string. 263 String syntheticUrl = format("https://soklet.invalid?%s", query); // avoid referencing real domain 264 return extractQueryParametersFromUrl(syntheticUrl, queryFormat, charset); 265 } 266 267 /** 268 * Parses query strings from relative or absolute URLs such as {@code "/example?a=a=1&b=2&c=%20"} or {@code "https://www.soklet.com/example?a=1&b=2&c=%20"} into a multimap of names to values. 269 * <p> 270 * Decodes percent-escapes using UTF-8, which is usually what you want (see {@link #extractQueryParametersFromUrl(String, QueryFormat, Charset)} if you need to specify a different charset). 271 * <p> 272 * Pairs missing a name are ignored. 273 * <p> 274 * Multiple occurrences of the same name are collected into a {@link Set} in insertion order (duplicates are de-duplicated). 275 * 276 * @param url a relative or absolute URL/URI string 277 * @param queryFormat how to decode: {@code application/x-www-form-urlencoded} or "strict" RFC 3986 278 * @return a map of parameter names to their distinct values, preserving first-seen name order; empty if none/invalid 279 */ 280 @Nonnull 281 public static Map<String, Set<String>> extractQueryParametersFromUrl(@Nonnull String url, 282 @Nonnull QueryFormat queryFormat) { 283 requireNonNull(url); 284 requireNonNull(queryFormat); 285 286 return extractQueryParametersFromUrl(url, queryFormat, StandardCharsets.UTF_8); 287 } 288 289 /** 290 * Parses query strings from relative or absolute URLs such as {@code "/example?a=a=1&b=2&c=%20"} or {@code "https://www.soklet.com/example?a=1&b=2&c=%20"} into a multimap of names to values. 291 * <p> 292 * Decodes percent-escapes using the specified charset. 293 * <p> 294 * Pairs missing a name are ignored. 295 * <p> 296 * Multiple occurrences of the same name are collected into a {@link Set} in insertion order (duplicates are de-duplicated). 297 * 298 * @param url a relative or absolute URL/URI string 299 * @param queryFormat how to decode: {@code application/x-www-form-urlencoded} or "strict" RFC 3986 300 * @param charset the charset to use when decoding percent-escapes 301 * @return a map of parameter names to their distinct values, preserving first-seen name order; empty if none/invalid 302 */ 303 @Nonnull 304 public static Map<String, Set<String>> extractQueryParametersFromUrl(@Nonnull String url, 305 @Nonnull QueryFormat queryFormat, 306 @Nonnull Charset charset) { 307 requireNonNull(url); 308 requireNonNull(queryFormat); 309 requireNonNull(charset); 310 311 URI uri; 312 313 try { 314 uri = new URI(url); 315 } catch (URISyntaxException e) { 316 return Map.of(); 317 } 318 319 String query = trimAggressivelyToNull(uri.getRawQuery()); 320 321 if (query == null) 322 return Map.of(); 323 324 Map<String, Set<String>> queryParameters = new LinkedHashMap<>(); 325 for (String pair : query.split("&")) { 326 if (pair.isEmpty()) 327 continue; 328 329 String[] nv = pair.split("=", 2); 330 String rawName = trimAggressivelyToNull(nv.length > 0 ? nv[0] : null); 331 String rawValue = trimAggressivelyToNull(nv.length > 1 ? nv[1] : null); 332 333 if (rawName == null) 334 continue; 335 336 // Preserve empty values; it's what users probably expect 337 if (rawValue == null) 338 rawValue = ""; 339 340 String name = decodeQueryComponent(rawName, queryFormat, charset); 341 String value = decodeQueryComponent(rawValue, queryFormat, charset); 342 343 queryParameters.computeIfAbsent(name, k -> new LinkedHashSet<>()).add(value); 344 } 345 346 return queryParameters; 347 } 348 349 /** 350 * Decodes a single key or value using the given mode and charset. 351 */ 352 @Nonnull 353 private static String decodeQueryComponent(@Nonnull String string, 354 @Nonnull QueryFormat queryFormat, 355 @Nonnull Charset charset) { 356 requireNonNull(string); 357 requireNonNull(queryFormat); 358 requireNonNull(charset); 359 360 if (string.isEmpty()) 361 return ""; 362 363 // Step 1: in form mode, '+' means space 364 String prepped = (queryFormat == QueryFormat.X_WWW_FORM_URLENCODED) ? string.replace('+', ' ') : string; 365 // Step 2: percent-decode bytes, then interpret bytes with the provided charset 366 return percentDecode(prepped, charset); 367 } 368 369 /** 370 * Percent-decodes a string into bytes, then constructs a String using the provided charset. 371 * One pass only: invalid %xy sequences are left as literal '%' + chars. 372 */ 373 @Nonnull 374 private static String percentDecode(@Nonnull String string, 375 @Nonnull Charset charset) { 376 requireNonNull(string); 377 requireNonNull(charset); 378 379 if (string.isEmpty()) 380 return ""; 381 382 ByteArrayOutputStream out = new ByteArrayOutputStream(string.length()); 383 for (int i = 0; i < string.length(); i++) { 384 char c = string.charAt(i); 385 if (c == '%' && i + 2 < string.length()) { 386 int hi = hex(string.charAt(i + 1)); 387 int lo = hex(string.charAt(i + 2)); 388 if (hi >= 0 && lo >= 0) { 389 out.write((hi << 4) | lo); 390 i += 2; 391 continue; 392 } 393 // fall through: invalid percent triplet, treat '%' literally 394 } 395 // Write this character's bytes in the given charset (ASCII-fast path is fine too) 396 byte[] bs = String.valueOf(c).getBytes(charset); 397 out.write(bs, 0, bs.length); 398 } 399 400 return new String(out.toByteArray(), charset); 401 } 402 403 private static int hex(char c) { 404 if (c >= '0' && c <= '9') return c - '0'; 405 if (c >= 'A' && c <= 'F') return c - 'A' + 10; 406 if (c >= 'a' && c <= 'f') return c - 'a' + 10; 407 return -1; 408 } 409 410 /** 411 * Parses {@code Cookie} request headers into a map of cookie names to values. 412 * <p> 413 * Header name matching is case-insensitive ({@code "Cookie"} vs {@code "cookie"}), but <em>cookie names are case-sensitive</em>. 414 * Values are parsed per the following liberal rules: 415 * <ul> 416 * <li>Components are split on {@code ';'} unless inside a quoted string.</li> 417 * <li>Quoted values have surrounding quotes removed and common backslash escapes unescaped.</li> 418 * <li>Percent-escapes are decoded as UTF-8. {@code '+'} is <strong>not</strong> treated specially.</li> 419 * </ul> 420 * Multiple occurrences of the same cookie name are collected into a {@link Set} in insertion order. 421 * 422 * @param headers request headers as a multimap of header name to values (must be non-{@code null}) 423 * @return a map of cookie name to distinct values; empty if no valid cookies are present 424 */ 425 @Nonnull 426 public static Map<String, Set<String>> extractCookiesFromHeaders(@Nonnull Map<String, Set<String>> headers) { 427 requireNonNull(headers); 428 429 // Cookie *names* must be case-sensitive; keep LinkedHashMap (NOT case-insensitive) 430 Map<String, Set<String>> cookies = new LinkedHashMap<>(); 431 432 for (Entry<String, Set<String>> entry : headers.entrySet()) { 433 String headerName = entry.getKey(); 434 if (headerName == null || !"cookie".equalsIgnoreCase(headerName.trim())) 435 continue; 436 437 Set<String> values = entry.getValue(); 438 if (values == null) continue; 439 440 for (String headerValue : values) { 441 headerValue = trimAggressivelyToNull(headerValue); 442 if (headerValue == null) continue; 443 444 // Split on ';' only when NOT inside a quoted string 445 List<String> cookieComponents = splitCookieHeaderRespectingQuotes(headerValue); 446 447 for (String cookieComponent : cookieComponents) { 448 cookieComponent = trimAggressivelyToNull(cookieComponent); 449 if (cookieComponent == null) continue; 450 451 String[] cookiePair = cookieComponent.split("=", 2); 452 String rawName = trimAggressivelyToNull(cookiePair[0]); 453 String rawValue = (cookiePair.length == 2 ? trimAggressivelyToNull(cookiePair[1]) : null); 454 455 if (rawName == null) continue; 456 457 // DO NOT decode the name; cookie names are case-sensitive and rarely encoded 458 String cookieName = rawName; 459 460 String cookieValue = null; 461 if (rawValue != null) { 462 // If it's quoted, unquote+unescape first, then percent-decode (still no '+' -> space) 463 String unquoted = unquoteCookieValueIfNeeded(rawValue); 464 cookieValue = percentDecodeCookieValue(unquoted); 465 } 466 467 cookies.computeIfAbsent(cookieName, key -> new LinkedHashSet<>()); 468 if (cookieValue != null) 469 cookies.get(cookieName).add(cookieValue); 470 } 471 } 472 } 473 474 return cookies; 475 } 476 477 /** 478 * Percent-decodes %HH to bytes->UTF-8. Does NOT treat '+' specially. 479 */ 480 @Nonnull 481 private static String percentDecodeCookieValue(@Nonnull String cookieValue) { 482 requireNonNull(cookieValue); 483 484 ByteArrayOutputStream out = new ByteArrayOutputStream(cookieValue.length()); 485 486 for (int i = 0; i < cookieValue.length(); ) { 487 char c = cookieValue.charAt(i); 488 if (c == '%' && i + 2 < cookieValue.length()) { 489 int hi = Character.digit(cookieValue.charAt(i + 1), 16); 490 int lo = Character.digit(cookieValue.charAt(i + 2), 16); 491 if (hi >= 0 && lo >= 0) { 492 out.write((hi << 4) + lo); 493 i += 3; 494 continue; 495 } 496 } 497 498 out.write((byte) c); 499 i++; 500 } 501 502 return out.toString(StandardCharsets.UTF_8); 503 } 504 505 /** 506 * Splits a Cookie header string into components on ';' but ONLY when not inside a quoted value. 507 * Supports backslash-escaped quotes within quoted strings. 508 */ 509 private static List<String> splitCookieHeaderRespectingQuotes(@Nonnull String headerValue) { 510 List<String> parts = new ArrayList<>(); 511 StringBuilder cur = new StringBuilder(headerValue.length()); 512 boolean inQuotes = false; 513 boolean escape = false; 514 515 for (int i = 0; i < headerValue.length(); i++) { 516 char c = headerValue.charAt(i); 517 518 if (escape) { 519 // keep escaped char literally (e.g., \" \; \\) 520 cur.append(c); 521 escape = false; 522 continue; 523 } 524 525 if (c == '\\') { 526 escape = true; 527 // keep the backslash for now; unquote step will handle unescaping 528 cur.append(c); 529 continue; 530 } 531 532 if (c == '"') { 533 inQuotes = !inQuotes; 534 cur.append(c); 535 continue; 536 } 537 538 if (c == ';' && !inQuotes) { 539 parts.add(cur.toString()); 540 cur.setLength(0); 541 continue; 542 } 543 544 cur.append(c); 545 } 546 547 if (cur.length() > 0) 548 parts.add(cur.toString()); 549 550 return parts; 551 } 552 553 /** 554 * If the cookie value is a quoted-string, remove surrounding quotes and unescape \" \\ and \; . 555 * Otherwise returns the input as-is. 556 */ 557 @Nonnull 558 private static String unquoteCookieValueIfNeeded(@Nonnull String rawValue) { 559 requireNonNull(rawValue); 560 561 if (rawValue.length() >= 2 && rawValue.charAt(0) == '"' && rawValue.charAt(rawValue.length() - 1) == '"') { 562 // Strip the surrounding quotes 563 String inner = rawValue.substring(1, rawValue.length() - 1); 564 565 // Unescape \" \\ and \; (common patterns seen in the wild) 566 // Order matters: unescape backslash-escape sequences, then leave other chars intact. 567 StringBuilder sb = new StringBuilder(inner.length()); 568 boolean escape = false; 569 570 for (int i = 0; i < inner.length(); i++) { 571 char c = inner.charAt(i); 572 if (escape) { 573 // Only special-case a few common escapes; otherwise keep the char 574 if (c == '"' || c == '\\' || c == ';') 575 sb.append(c); 576 else 577 sb.append(c); // unknown escape -> keep literally (liberal in what we accept) 578 579 escape = false; 580 } else if (c == '\\') { 581 escape = true; 582 } else { 583 sb.append(c); 584 } 585 } 586 587 // If string ended with a dangling backslash, keep it literally 588 if (escape) 589 sb.append('\\'); 590 591 return sb.toString(); 592 } 593 594 return rawValue; 595 } 596 597 /** 598 * Normalizes a URL or path into a canonical request path and optionally performs percent-decoding on the path. 599 * <p> 600 * For example, {@code "https://www.soklet.com/ab%20c?one=two"} would be normalized to {@code "/ab c"}. 601 * <p> 602 * The {@code OPTIONS *} special case returns {@code "*"}. 603 * <p> 604 * Behavior: 605 * <ul> 606 * <li>If input starts with {@code http://} or {@code https://}, the path portion is extracted.</li> 607 * <li>Ensures the result begins with {@code '/'}.</li> 608 * <li>Removes any trailing {@code '/'} (except for the root path {@code '/'}).</li> 609 * <li>Safely normalizes path traversals, e.g. path {@code '/a/../b'} would be normalized to {@code '/b'}</li> 610 * <li>Strips any query string.</li> 611 * <li>Applies aggressive trimming of Unicode whitespace.</li> 612 * </ul> 613 * 614 * @param url a URL or path to normalize 615 * @param performDecoding {@code true} if decoding should be performed on the path (e.g. replace {@code %20} with a space character), {@code false} otherwise 616 * @return the normalized path, {@code "/"} for empty input 617 */ 618 @Nonnull 619 public static String extractPathFromUrl(@Nonnull String url, 620 @Nonnull Boolean performDecoding) { 621 requireNonNull(url); 622 623 url = trimAggressivelyToEmpty(url); 624 625 // Special case for OPTIONS * requests 626 if (url.equals("*")) 627 return "*"; 628 629 // Parse with java.net.URI to isolate raw path; then percent-decode only the path 630 try { 631 URI uri = new URI(url); 632 633 String rawPath = uri.getRawPath(); // null => "/" 634 635 if (rawPath == null || rawPath.isEmpty()) 636 rawPath = "/"; 637 638 String decodedPath = performDecoding ? percentDecode(rawPath, StandardCharsets.UTF_8) : rawPath; 639 640 // Sanitize path traversal (e.g. /a/../b -> /b) 641 decodedPath = removeDotSegments(decodedPath); 642 643 // Normalize trailing slashes like normalizedPathForUrl currently does 644 if (!decodedPath.startsWith("/")) 645 decodedPath = "/" + decodedPath; 646 647 if (!"/".equals(decodedPath)) 648 while (decodedPath.endsWith("/")) 649 decodedPath = decodedPath.substring(0, decodedPath.length() - 1); 650 651 return decodedPath; 652 } catch (URISyntaxException e) { 653 // If it's not an absolute URL, treat the whole string as a path and percent-decode 654 String path = url; 655 int q = path.indexOf('?'); 656 657 if (q != -1) 658 path = path.substring(0, q); 659 660 String decodedPath = performDecoding ? percentDecode(path, StandardCharsets.UTF_8) : path; 661 662 // Sanitize path traversal (e.g. /a/../b -> /b) 663 decodedPath = removeDotSegments(decodedPath); 664 665 if (!decodedPath.startsWith("/")) 666 decodedPath = "/" + decodedPath; 667 668 if (!"/".equals(decodedPath)) 669 while (decodedPath.endsWith("/")) 670 decodedPath = decodedPath.substring(0, decodedPath.length() - 1); 671 672 return decodedPath; 673 } 674 } 675 676 /** 677 * Extracts the raw (un-decoded) query component from a URL. 678 * <p> 679 * For example, {@code "/path?a=b&c=d%20e"} would return {@code "a=b&c=d%20e"}. 680 * 681 * @param url a raw URL or path 682 * @return the raw query component, or {@link Optional#empty()} if none 683 */ 684 @Nonnull 685 public static Optional<String> extractRawQueryFromUrl(@Nonnull String url) { 686 requireNonNull(url); 687 688 url = trimAggressivelyToEmpty(url); 689 690 if ("*".equals(url)) 691 return Optional.empty(); 692 693 try { 694 URI uri = new URI(url); 695 return Optional.ofNullable(trimAggressivelyToNull(uri.getRawQuery())); 696 } catch (URISyntaxException e) { 697 // Not a valid URI, try to extract query manually 698 int q = url.indexOf('?'); 699 if (q == -1) 700 return Optional.empty(); 701 702 String query = trimAggressivelyToNull(url.substring(q + 1)); 703 return Optional.ofNullable(query); 704 } 705 } 706 707 /** 708 * Encodes decoded query parameters into a raw query string. 709 * <p> 710 * For example, given {@code {a=[b], c=[d e]}} and {@link QueryFormat#RFC_3986_STRICT}, 711 * returns {@code "a=b&c=d%20e"}. 712 * 713 * @param queryParameters the decoded query parameters 714 * @param queryFormat the encoding strategy 715 * @return the encoded query string, or the empty string if no parameters 716 */ 717 @Nonnull 718 public static String encodeQueryParameters(@Nonnull Map<String, Set<String>> queryParameters, 719 @Nonnull QueryFormat queryFormat) { 720 requireNonNull(queryParameters); 721 requireNonNull(queryFormat); 722 723 if (queryParameters.isEmpty()) 724 return ""; 725 726 StringBuilder sb = new StringBuilder(); 727 boolean first = true; 728 729 for (Entry<String, Set<String>> entry : queryParameters.entrySet()) { 730 String encodedName = encodeQueryComponent(entry.getKey(), queryFormat); 731 732 for (String value : entry.getValue()) { 733 if (!first) 734 sb.append('&'); 735 736 sb.append(encodedName); 737 sb.append('='); 738 sb.append(encodeQueryComponent(value, queryFormat)); 739 740 first = false; 741 } 742 } 743 744 return sb.toString(); 745 } 746 747 @Nonnull 748 static String encodeQueryComponent(@Nonnull String queryComponent, 749 @Nonnull QueryFormat queryFormat) { 750 requireNonNull(queryComponent); 751 requireNonNull(queryFormat); 752 753 String encoded = URLEncoder.encode(queryComponent, StandardCharsets.UTF_8); 754 755 if (queryFormat == QueryFormat.RFC_3986_STRICT) 756 encoded = encoded.replace("+", "%20"); 757 758 return encoded; 759 } 760 761 @Nonnull 762 static String encodePath(@Nonnull String path) { 763 requireNonNull(path); 764 765 if ("*".equals(path)) 766 return path; 767 768 // Encode each path segment individually, preserving '/' separators. 769 // RFC 3986 is used for path encoding (spaces as %20, not +). 770 return Arrays.stream(path.split("/", -1)) 771 .map(segment -> URLEncoder.encode(segment, StandardCharsets.UTF_8).replace("+", "%20")) 772 .collect(Collectors.joining("/")); 773 } 774 775 /** 776 * Parses an {@code Accept-Language} header value into a best-effort ordered list of {@link Locale}s. 777 * <p> 778 * Quality weights are honored by {@link Locale.LanguageRange#parse(String)}; results are then mapped to available 779 * JVM locales. Unknown or unavailable language ranges are skipped. On parse failure, an empty list is returned. 780 * 781 * @param acceptLanguageHeaderValue the raw header value (must be non-{@code null}) 782 * @return locales in descending preference order; empty if none could be resolved 783 */ 784 @Nonnull 785 public static List<Locale> extractLocalesFromAcceptLanguageHeaderValue(@Nonnull String acceptLanguageHeaderValue) { 786 requireNonNull(acceptLanguageHeaderValue); 787 788 try { 789 List<LanguageRange> languageRanges = LanguageRange.parse(acceptLanguageHeaderValue); 790 791 return languageRanges.stream() 792 .map(languageRange -> LOCALES_BY_LANGUAGE_RANGE_RANGE.get(languageRange.getRange())) 793 .filter(locale -> locale != null) 794 .collect(Collectors.toList()); 795 } catch (Exception ignored) { 796 return List.of(); 797 } 798 } 799 800 /** 801 * Best-effort attempt to determine a client's URL prefix by examining request headers. 802 * <p> 803 * A URL prefix in this context is defined as {@code <scheme>://host<:optional port>}, but no path or query components. 804 * <p> 805 * Soklet is generally the "last hop" behind a load balancer/reverse proxy and does get accessed directly by clients. 806 * <p> 807 * Normally a load balancer/reverse proxy/other upstream proxies will provide information about the true source of the 808 * request through headers like the following: 809 * <ul> 810 * <li>{@code Host}</li> 811 * <li>{@code Forwarded}</li> 812 * <li>{@code Origin}</li> 813 * <li>{@code X-Forwarded-Proto}</li> 814 * <li>{@code X-Forwarded-Protocol}</li> 815 * <li>{@code X-Url-Scheme}</li> 816 * <li>{@code Front-End-Https}</li> 817 * <li>{@code X-Forwarded-Ssl}</li> 818 * <li>{@code X-Forwarded-Host}</li> 819 * <li>{@code X-Forwarded-Port}</li> 820 * </ul> 821 * <p> 822 * This method may take these and other headers into account when determining URL prefix. 823 * <p> 824 * For example, the following would be legal URL prefixes returned from this method: 825 * <ul> 826 * <li>{@code https://www.soklet.com}</li> 827 * <li>{@code http://www.fake.com:1234}</li> 828 * </ul> 829 * <p> 830 * The following would NOT be legal URL prefixes: 831 * <ul> 832 * <li>{@code www.soklet.com} (missing protocol) </li> 833 * <li>{@code https://www.soklet.com/} (trailing slash)</li> 834 * <li>{@code https://www.soklet.com/test} (trailing slash, path)</li> 835 * <li>{@code https://www.soklet.com/test?abc=1234} (trailing slash, path, query)</li> 836 * </ul> 837 * 838 * @param headers HTTP request headers 839 * @return the URL prefix, or {@link Optional#empty()} if it could not be determined 840 */ 841 @Nonnull 842 public static Optional<String> extractClientUrlPrefixFromHeaders(@Nonnull Map<String, Set<String>> headers) { 843 requireNonNull(headers); 844 845 // Host developer.mozilla.org OR developer.mozilla.org:443 OR [2001:db8::1]:8443 846 // Forwarded by=<identifier>;for=<identifier>;host=<host>;proto=<http|https> (can be repeated if comma-separated, e.g. for=12.34.56.78;host=example.com;proto=https, for=23.45.67.89) 847 // Origin null OR <scheme>://<hostname> OR <scheme>://<hostname>:<port> 848 // X-Forwarded-Proto https 849 // X-Forwarded-Protocol https (Microsoft's alternate name) 850 // X-Url-Scheme https (Microsoft's alternate name) 851 // Front-End-Https on (Microsoft's alternate name) 852 // X-Forwarded-Ssl on (Microsoft's alternate name) 853 // X-Forwarded-Host id42.example-cdn.com 854 // X-Forwarded-Port 443 855 856 String protocol = null; 857 String host = null; 858 String portAsString = null; 859 Boolean portExplicit = false; 860 861 // Host: developer.mozilla.org OR developer.mozilla.org:443 OR [2001:db8::1]:8443 862 Set<String> hostHeaders = headers.get("Host"); 863 864 if (hostHeaders != null && !hostHeaders.isEmpty()) { 865 HostPort hostPort = parseHostPort(hostHeaders.iterator().next()).orElse(null); 866 867 if (hostPort != null) { 868 host = hostPort.getHost(); 869 870 if (hostPort.getPort().isPresent()) { 871 portAsString = String.valueOf(hostPort.getPort().get()); 872 portExplicit = true; 873 } 874 } 875 } 876 877 // Forwarded: by=<identifier>;for=<identifier>;host=<host>;proto=<http|https> 878 Set<String> forwardedHeaders = headers.get("Forwarded"); 879 if (forwardedHeaders != null && forwardedHeaders.size() > 0) { 880 String forwardedHeader = trimAggressivelyToNull(forwardedHeaders.stream().findFirst().get()); 881 882 // If there are multiple comma-separated components, pick the first one 883 String[] forwardedHeaderComponents = forwardedHeader != null ? forwardedHeader.split(",") : new String[0]; 884 forwardedHeader = forwardedHeaderComponents.length > 0 ? trimAggressivelyToNull(forwardedHeaderComponents[0]) : null; 885 886 if (forwardedHeader != null) { 887 // Each field component might look like "by=<identifier>" 888 String[] forwardedHeaderFieldComponents = forwardedHeader.split(";"); 889 for (String forwardedHeaderFieldComponent : forwardedHeaderFieldComponents) { 890 forwardedHeaderFieldComponent = trimAggressivelyToNull(forwardedHeaderFieldComponent); 891 if (forwardedHeaderFieldComponent == null) 892 continue; 893 894 // Break "by=<identifier>" into "by" and "<identifier>" pieces 895 String[] forwardedHeaderFieldNameAndValue = forwardedHeaderFieldComponent.split(Pattern.quote("=" /* escape special Regex char */)); 896 if (forwardedHeaderFieldNameAndValue.length != 2) 897 continue; 898 899 String name = trimAggressivelyToNull(forwardedHeaderFieldNameAndValue[0]); 900 String value = trimAggressivelyToNull(forwardedHeaderFieldNameAndValue[1]); 901 if (name == null || value == null) 902 continue; 903 904 if ("host".equalsIgnoreCase(name)) { 905 if (host == null) { 906 HostPort hostPort = parseHostPort(value).orElse(null); 907 908 if (hostPort != null) { 909 host = hostPort.getHost(); 910 911 if (hostPort.getPort().isPresent()) { 912 portAsString = String.valueOf(hostPort.getPort().get()); 913 portExplicit = true; 914 } 915 } 916 } 917 } else if ("proto".equalsIgnoreCase(name)) { 918 if (protocol == null) 919 protocol = stripOptionalQuotes(value); 920 } 921 } 922 } 923 } 924 925 // Origin: null OR <scheme>://<hostname> OR <scheme>://<hostname>:<port> (IPv6 supported) 926 if (protocol == null || host == null || portAsString == null) { 927 Set<String> originHeaders = headers.get("Origin"); 928 929 if (originHeaders != null && !originHeaders.isEmpty()) { 930 String originHeader = trimAggressivelyToNull(originHeaders.iterator().next()); 931 try { 932 URI o = new URI(originHeader); 933 String sch = trimAggressivelyToNull(o.getScheme()); 934 String h = o.getHost(); // may be bracketed already on some JDKs 935 int p = o.getPort(); // -1 if absent 936 937 if (sch != null) 938 protocol = sch; 939 940 if (h != null) { 941 boolean alreadyBracketed = h.startsWith("[") && h.endsWith("]"); 942 boolean isIpv6Like = h.indexOf(':') >= 0; // contains colon(s) 943 host = (isIpv6Like && !alreadyBracketed) ? "[" + h + "]" : h; 944 } 945 946 if (p >= 0) { 947 portAsString = String.valueOf(p); 948 portExplicit = true; 949 } 950 } catch (URISyntaxException ignored) { 951 // no-op 952 } 953 } 954 } 955 956 // X-Forwarded-Proto: https 957 if (protocol == null) { 958 Set<String> xForwardedProtoHeaders = headers.get("X-Forwarded-Proto"); 959 if (xForwardedProtoHeaders != null && xForwardedProtoHeaders.size() > 0) { 960 String xForwardedProtoHeader = trimAggressivelyToNull(xForwardedProtoHeaders.stream().findFirst().get()); 961 protocol = xForwardedProtoHeader; 962 } 963 } 964 965 // X-Forwarded-Protocol: https (Microsoft's alternate name) 966 if (protocol == null) { 967 Set<String> xForwardedProtocolHeaders = headers.get("X-Forwarded-Protocol"); 968 if (xForwardedProtocolHeaders != null && xForwardedProtocolHeaders.size() > 0) { 969 String xForwardedProtocolHeader = trimAggressivelyToNull(xForwardedProtocolHeaders.stream().findFirst().get()); 970 protocol = xForwardedProtocolHeader; 971 } 972 } 973 974 // X-Url-Scheme: https (Microsoft's alternate name) 975 if (protocol == null) { 976 Set<String> xUrlSchemeHeaders = headers.get("X-Url-Scheme"); 977 if (xUrlSchemeHeaders != null && xUrlSchemeHeaders.size() > 0) { 978 String xUrlSchemeHeader = trimAggressivelyToNull(xUrlSchemeHeaders.stream().findFirst().get()); 979 protocol = xUrlSchemeHeader; 980 } 981 } 982 983 // Front-End-Https: on (Microsoft's alternate name) 984 if (protocol == null) { 985 Set<String> frontEndHttpsHeaders = headers.get("Front-End-Https"); 986 if (frontEndHttpsHeaders != null && frontEndHttpsHeaders.size() > 0) { 987 String frontEndHttpsHeader = trimAggressivelyToNull(frontEndHttpsHeaders.stream().findFirst().get()); 988 989 if (frontEndHttpsHeader != null) 990 protocol = "on".equalsIgnoreCase(frontEndHttpsHeader) ? "https" : "http"; 991 } 992 } 993 994 // X-Forwarded-Ssl: on (Microsoft's alternate name) 995 if (protocol == null) { 996 Set<String> xForwardedSslHeaders = headers.get("X-Forwarded-Ssl"); 997 if (xForwardedSslHeaders != null && xForwardedSslHeaders.size() > 0) { 998 String xForwardedSslHeader = trimAggressivelyToNull(xForwardedSslHeaders.stream().findFirst().get()); 999 1000 if (xForwardedSslHeader != null) 1001 protocol = "on".equalsIgnoreCase(xForwardedSslHeader) ? "https" : "http"; 1002 } 1003 } 1004 1005 // X-Forwarded-Host: id42.example-cdn.com (or with port / IPv6) 1006 if (host == null) { 1007 Set<String> xForwardedHostHeaders = headers.get("X-Forwarded-Host"); 1008 if (xForwardedHostHeaders != null && xForwardedHostHeaders.size() > 0) { 1009 HostPort hostPort = parseHostPort(xForwardedHostHeaders.iterator().next()).orElse(null); 1010 1011 if (hostPort != null) { 1012 host = hostPort.getHost(); 1013 1014 if (hostPort.getPort().isPresent() && portAsString == null) { 1015 portAsString = String.valueOf(hostPort.getPort().get()); 1016 portExplicit = true; 1017 } 1018 } 1019 } 1020 } 1021 1022 // X-Forwarded-Port: 443 1023 if (portAsString == null) { 1024 Set<String> xForwardedPortHeaders = headers.get("X-Forwarded-Port"); 1025 if (xForwardedPortHeaders != null && xForwardedPortHeaders.size() > 0) { 1026 String xForwardedPortHeader = trimAggressivelyToNull(xForwardedPortHeaders.stream().findFirst().get()); 1027 portAsString = xForwardedPortHeader; 1028 1029 if (xForwardedPortHeader != null) 1030 portExplicit = true; 1031 } 1032 } 1033 1034 Integer port = null; 1035 1036 if (portAsString != null) { 1037 try { 1038 port = Integer.parseInt(portAsString, 10); 1039 } catch (Exception ignored) { 1040 // Not an integer; ignore it 1041 } 1042 } 1043 1044 if (protocol != null && host != null && port == null) { 1045 return Optional.of(format("%s://%s", protocol, host)); 1046 } 1047 1048 if (protocol != null && host != null && port != null) { 1049 boolean usingDefaultPort = 1050 ("http".equalsIgnoreCase(protocol) && port.equals(80)) || 1051 ("https".equalsIgnoreCase(protocol) && port.equals(443)); 1052 1053 // Keep default ports if the client/proxy explicitly sent them 1054 String clientUrlPrefix = (usingDefaultPort && !portExplicit) 1055 ? format("%s://%s", protocol, host) 1056 : format("%s://%s:%s", protocol, host, port); 1057 1058 return Optional.of(clientUrlPrefix); 1059 } 1060 1061 return Optional.empty(); 1062 } 1063 1064 /** 1065 * Extracts the media type (without parameters) from the first {@code Content-Type} header. 1066 * <p> 1067 * For example, {@code "text/html; charset=UTF-8"} → {@code "text/html"}. 1068 * 1069 * @param headers request/response headers (must be non-{@code null}) 1070 * @return the media type if present; otherwise {@link Optional#empty()} 1071 * @see #extractContentTypeFromHeaderValue(String) 1072 */ 1073 @Nonnull 1074 public static Optional<String> extractContentTypeFromHeaders(@Nonnull Map<String, Set<String>> headers) { 1075 requireNonNull(headers); 1076 1077 Set<String> contentTypeHeaderValues = headers.get("Content-Type"); 1078 1079 if (contentTypeHeaderValues == null || contentTypeHeaderValues.size() == 0) 1080 return Optional.empty(); 1081 1082 return extractContentTypeFromHeaderValue(contentTypeHeaderValues.stream().findFirst().get()); 1083 } 1084 1085 /** 1086 * Extracts the media type (without parameters) from a {@code Content-Type} header value. 1087 * <p> 1088 * For example, {@code "application/json; charset=UTF-8"} → {@code "application/json"}. 1089 * 1090 * @param contentTypeHeaderValue the raw header value; may be {@code null} or blank 1091 * @return the media type if present; otherwise {@link Optional#empty()} 1092 */ 1093 @Nonnull 1094 public static Optional<String> extractContentTypeFromHeaderValue(@Nullable String contentTypeHeaderValue) { 1095 contentTypeHeaderValue = trimAggressivelyToNull(contentTypeHeaderValue); 1096 1097 if (contentTypeHeaderValue == null) 1098 return Optional.empty(); 1099 1100 // Examples 1101 // Content-Type: text/html; charset=UTF-8 1102 // Content-Type: multipart/form-data; boundary=something 1103 1104 int indexOfSemicolon = contentTypeHeaderValue.indexOf(";"); 1105 1106 // Simple case, e.g. "text/html" 1107 if (indexOfSemicolon == -1) 1108 return Optional.ofNullable(trimAggressivelyToNull(contentTypeHeaderValue)); 1109 1110 // More complex case, e.g. "text/html; charset=UTF-8" 1111 return Optional.ofNullable(trimAggressivelyToNull(contentTypeHeaderValue.substring(0, indexOfSemicolon))); 1112 } 1113 1114 /** 1115 * Extracts the {@link Charset} from the first {@code Content-Type} header, if present and valid. 1116 * <p> 1117 * Tolerates additional parameters and arbitrary whitespace. Invalid or unknown charset tokens yield {@link Optional#empty()}. 1118 * 1119 * @param headers request/response headers (must be non-{@code null}) 1120 * @return the charset declared by the header; otherwise {@link Optional#empty()} 1121 * @see #extractCharsetFromHeaderValue(String) 1122 */ 1123 @Nonnull 1124 public static Optional<Charset> extractCharsetFromHeaders(@Nonnull Map<String, Set<String>> headers) { 1125 requireNonNull(headers); 1126 1127 Set<String> contentTypeHeaderValues = headers.get("Content-Type"); 1128 1129 if (contentTypeHeaderValues == null || contentTypeHeaderValues.size() == 0) 1130 return Optional.empty(); 1131 1132 return extractCharsetFromHeaderValue(contentTypeHeaderValues.stream().findFirst().get()); 1133 } 1134 1135 /** 1136 * Extracts the {@code charset=...} parameter from a {@code Content-Type} header value. 1137 * <p> 1138 * Parsing is forgiving: parameters may appear in any order and with arbitrary spacing. If a charset is found, 1139 * it is validated via {@link Charset#forName(String)}; invalid names result in {@link Optional#empty()}. 1140 * 1141 * @param contentTypeHeaderValue the raw header value; may be {@code null} or blank 1142 * @return the resolved charset if present and valid; otherwise {@link Optional#empty()} 1143 */ 1144 @Nonnull 1145 public static Optional<Charset> extractCharsetFromHeaderValue(@Nullable String contentTypeHeaderValue) { 1146 contentTypeHeaderValue = trimAggressivelyToNull(contentTypeHeaderValue); 1147 1148 if (contentTypeHeaderValue == null) 1149 return Optional.empty(); 1150 1151 // Examples 1152 // Content-Type: text/html; charset=UTF-8 1153 // Content-Type: multipart/form-data; boundary=something 1154 1155 int indexOfSemicolon = contentTypeHeaderValue.indexOf(";"); 1156 1157 // Simple case, e.g. "text/html" 1158 if (indexOfSemicolon == -1) 1159 return Optional.empty(); 1160 1161 // More complex case, e.g. "text/html; charset=UTF-8" or "multipart/form-data; charset=UTF-8; boundary=something" 1162 boolean finishedContentType = false; 1163 boolean finishedCharsetName = false; 1164 StringBuilder buffer = new StringBuilder(); 1165 String charsetName = null; 1166 1167 for (int i = 0; i < contentTypeHeaderValue.length(); i++) { 1168 char c = contentTypeHeaderValue.charAt(i); 1169 1170 if (Character.isWhitespace(c)) 1171 continue; 1172 1173 if (c == ';') { 1174 // No content type yet? This just be it... 1175 if (!finishedContentType) { 1176 finishedContentType = true; 1177 buffer = new StringBuilder(); 1178 } else if (!finishedCharsetName) { 1179 if (buffer.indexOf("charset=") == 0) { 1180 charsetName = buffer.toString(); 1181 finishedCharsetName = true; 1182 break; 1183 } 1184 } 1185 } else { 1186 buffer.append(Character.toLowerCase(c)); 1187 } 1188 } 1189 1190 // Handle case where charset is the end of the string, e.g. "whatever;charset=UTF-8" 1191 if (!finishedCharsetName) { 1192 String potentialCharset = trimAggressivelyToNull(buffer.toString()); 1193 if (potentialCharset != null && potentialCharset.startsWith("charset=")) { 1194 finishedCharsetName = true; 1195 charsetName = potentialCharset; 1196 } 1197 } 1198 1199 if (finishedCharsetName) { 1200 // e.g. charset=UTF-8 or charset="UTF-8" or charset='UTF-8' 1201 String possibleCharsetName = trimAggressivelyToNull(charsetName.replace("charset=", "")); 1202 1203 if (possibleCharsetName != null) { 1204 // strip optional surrounding quotes 1205 if ((possibleCharsetName.length() >= 2) && 1206 ((possibleCharsetName.charAt(0) == '"' && possibleCharsetName.charAt(possibleCharsetName.length() - 1) == '"') || 1207 (possibleCharsetName.charAt(0) == '\'' && possibleCharsetName.charAt(possibleCharsetName.length() - 1) == '\''))) { 1208 possibleCharsetName = possibleCharsetName.substring(1, possibleCharsetName.length() - 1); 1209 possibleCharsetName = trimAggressivelyToNull(possibleCharsetName); 1210 } 1211 1212 if (possibleCharsetName != null) { 1213 try { 1214 return Optional.of(Charset.forName(possibleCharsetName)); 1215 } catch (IllegalCharsetNameException | UnsupportedCharsetException ignored) { 1216 return Optional.empty(); 1217 } 1218 } 1219 } 1220 } 1221 1222 return Optional.empty(); 1223 } 1224 1225 /** 1226 * A "stronger" version of {@link String#trim()} which discards any kind of whitespace or invisible separator. 1227 * <p> 1228 * In a web environment with user-supplied inputs, this is the behavior we want the vast majority of the time. 1229 * For example, users copy-paste URLs from Microsoft Word or Outlook and it's easy to accidentally include a {@code U+202F 1230 * "Narrow No-Break Space (NNBSP)"} character at the end, which might break parsing. 1231 * <p> 1232 * See <a href="https://www.compart.com/en/unicode/U+202F">https://www.compart.com/en/unicode/U+202F</a> for details. 1233 * 1234 * @param string the string to trim 1235 * @return the trimmed string, or {@code null} if the input string is {@code null} or the trimmed representation is of length {@code 0} 1236 */ 1237 @Nullable 1238 public static String trimAggressively(@Nullable String string) { 1239 if (string == null) 1240 return null; 1241 1242 string = HEAD_WHITESPACE_PATTERN.matcher(string).replaceAll(""); 1243 1244 if (string.length() == 0) 1245 return string; 1246 1247 string = TAIL_WHITESPACE_PATTERN.matcher(string).replaceAll(""); 1248 1249 return string; 1250 } 1251 1252 /** 1253 * Aggressively trims Unicode whitespace from the given string and returns {@code null} if the result is empty. 1254 * <p> 1255 * See {@link #trimAggressively(String)} for details on which code points are removed. 1256 * 1257 * @param string the input string; may be {@code null} 1258 * @return a trimmed, non-empty string; or {@code null} if input was {@code null} or trimmed to empty 1259 */ 1260 @Nullable 1261 public static String trimAggressivelyToNull(@Nullable String string) { 1262 if (string == null) 1263 return null; 1264 1265 string = trimAggressively(string); 1266 return string.length() == 0 ? null : string; 1267 } 1268 1269 /** 1270 * Aggressively trims Unicode whitespace from the given string and returns {@code ""} if the input is {@code null}. 1271 * <p> 1272 * See {@link #trimAggressively(String)} for details on which code points are removed. 1273 * 1274 * @param string the input string; may be {@code null} 1275 * @return a trimmed string (never {@code null}); {@code ""} if input was {@code null} 1276 */ 1277 @Nonnull 1278 public static String trimAggressivelyToEmpty(@Nullable String string) { 1279 if (string == null) 1280 return ""; 1281 1282 return trimAggressively(string); 1283 } 1284 1285 static void validateHeaderNameAndValue(@Nullable String name, 1286 @Nullable String value) { 1287 // First, validate name: 1288 name = trimAggressivelyToNull(name); 1289 1290 if (name == null) 1291 throw new IllegalArgumentException("Header name is blank"); 1292 1293 for (int i = 0; i < name.length(); i++) { 1294 char c = name.charAt(i); 1295 // RFC 9110 tchar: "!" / "#" / "$" / "%" / "&" / "'" / "*" / "+" / "-" / "." / "^" / "_" / "`" / "|" / "~" / DIGIT / ALPHA 1296 if (!(c == '!' || c == '#' || c == '$' || c == '%' || c == '&' || c == '\'' || c == '*' || c == '+' || 1297 c == '-' || c == '.' || c == '^' || c == '_' || c == '`' || c == '|' || c == '~' || 1298 Character.isLetterOrDigit(c))) { 1299 throw new IllegalArgumentException(format("Illegal header name '%s'. Offending character: '%s'", name, printableChar(c))); 1300 } 1301 } 1302 1303 // Then, validate value: 1304 if (value == null) 1305 return; 1306 1307 for (int i = 0; i < value.length(); i++) { 1308 char c = value.charAt(i); 1309 if (c == '\r' || c == '\n' || c == 0x00 || (c >= 0x00 && c < 0x20 && c != '\t')) { 1310 throw new IllegalArgumentException(format("Illegal header value '%s' for header name '%s'. Offending character: '%s'", value, name, printableChar(c))); 1311 } 1312 } 1313 1314 // Percent-encoded control sequence checks 1315 Matcher m = HEADER_PERCENT_ENCODING_PATTERN.matcher(value); 1316 1317 while (m.find()) { 1318 int b = Integer.parseInt(m.group(1), 16); 1319 if (b == 0x0D || b == 0x0A || b == 0x00 || (b >= 0x00 && b < 0x20 && b != 0x09)) { 1320 throw new IllegalArgumentException(format( 1321 "Illegal (percent-encoded) header value '%s' for header name '%s'. Offending octet: 0x%02X", 1322 value, name, b)); 1323 } 1324 } 1325 } 1326 1327 @Nonnull 1328 static String printableString(@Nonnull String input) { 1329 requireNonNull(input); 1330 1331 StringBuilder out = new StringBuilder(input.length() + 16); 1332 1333 for (int i = 0; i < input.length(); i++) 1334 out.append(printableChar(input.charAt(i))); 1335 1336 return out.toString(); 1337 } 1338 1339 @Nonnull 1340 static String printableChar(char c) { 1341 if (c == '\r') return "\\r"; 1342 if (c == '\n') return "\\n"; 1343 if (c == '\t') return "\\t"; 1344 if (c == '\f') return "\\f"; 1345 if (c == '\b') return "\\b"; 1346 if (c == '\\') return "\\\\"; 1347 if (c == '\'') return "\\'"; 1348 if (c == '\"') return "\\\""; 1349 if (c == 0) return "\\0"; 1350 1351 if (c < 0x20 || c == 0x7F) // control chars 1352 return String.format("\\u%04X", (int) c); 1353 1354 if (Character.isISOControl(c) || Character.getType(c) == Character.FORMAT) 1355 return String.format("\\u%04X", (int) c); 1356 1357 return String.valueOf(c); 1358 } 1359 1360 @Nonnull 1361 private static final Set<String> COMMA_JOINABLE_HEADER_NAMES = Set.of( 1362 // Common list-type headers (RFC 7230/9110) 1363 "accept", 1364 "accept-encoding", 1365 "accept-language", 1366 "cache-control", 1367 "pragma", 1368 "vary", 1369 "connection", 1370 "transfer-encoding", 1371 "upgrade", 1372 "allow", 1373 "via", 1374 "warning" 1375 // intentionally NOT: set-cookie, authorization, cookie, content-disposition, location 1376 ); 1377 1378 /** 1379 * Given a list of raw HTTP header lines, convert them into a normalized case-insensitive, order-preserving map which "inflates" comma-separated headers into distinct values where permitted according to RFC 7230/9110. 1380 * <p> 1381 * For example, given these raw header lines: 1382 * <pre>{@code List<String> lines = List.of( 1383 * "Cache-Control: no-cache, no-store", 1384 * "Set-Cookie: a=b; Path=/; HttpOnly", 1385 * "Set-Cookie: c=d; Expires=Wed, 21 Oct 2015 07:28:00 GMT; Path=/" 1386 * );}</pre> 1387 * The result of parsing would look like this: 1388 * <pre>{@code result.get("cache-control") -> [ 1389 * "no-cache", 1390 * "no-store" 1391 * ] 1392 * result.get("set-cookie") -> [ 1393 * "a=b; Path=/; HttpOnly", 1394 * "c=d; Expires=Wed, 21 Oct 2015 07:28:00 GMT; Path=/" 1395 * ]}</pre> 1396 * <p> 1397 * Keys in the returned map are case-insensitive and are guaranteed to be in the same order as encountered in {@code rawHeaderLines}. 1398 * <p> 1399 * Values in the returned map are guaranteed to be in the same order as encountered in {@code rawHeaderLines}. 1400 * 1401 * @param rawHeaderLines the raw HTTP header lines to parse 1402 * @return a normalized mapping of header name keys to values 1403 */ 1404 @Nonnull 1405 public static Map<String, Set<String>> extractHeadersFromRawHeaderLines(@Nonnull List<String> rawHeaderLines) { 1406 requireNonNull(rawHeaderLines); 1407 1408 // 1) Unfold obsolete folded lines (obs-fold): lines beginning with SP/HT are continuations 1409 List<String> lines = unfold(rawHeaderLines); 1410 1411 // 2) Parse into map 1412 Map<String, Set<String>> headers = new LinkedCaseInsensitiveMap<>(); 1413 1414 for (String raw : lines) { 1415 String line = trimAggressivelyToNull(raw); 1416 1417 if (line == null) 1418 continue; 1419 1420 int idx = line.indexOf(':'); 1421 1422 if (idx <= 0) 1423 continue; // skip malformed 1424 1425 String key = trimAggressivelyToEmpty(line.substring(0, idx)); // keep original case for display 1426 String keyLowercase = key.toLowerCase(Locale.ROOT); 1427 String value = trimAggressivelyToNull(line.substring(idx + 1)); 1428 1429 if (value == null) 1430 continue; 1431 1432 Set<String> bucket = headers.computeIfAbsent(key, k -> new LinkedHashSet<>()); 1433 1434 if (COMMA_JOINABLE_HEADER_NAMES.contains(keyLowercase)) { 1435 for (String part : splitCommaAware(value)) { 1436 String v = trimAggressivelyToNull(part); 1437 if (v != null) 1438 bucket.add(v); 1439 } 1440 } else { 1441 bucket.add(value.trim()); 1442 } 1443 } 1444 1445 return headers; 1446 } 1447 1448 /** 1449 * Header parsing helper 1450 */ 1451 @Nonnull 1452 private static List<String> unfold(@Nonnull List<String> raw) { 1453 requireNonNull(raw); 1454 if (raw.isEmpty()) return List.of(); 1455 1456 List<String> out = new ArrayList<>(raw.size()); 1457 StringBuilder cur = null; 1458 boolean curIsHeader = false; 1459 1460 for (String line : raw) { 1461 if (line == null) continue; 1462 1463 boolean isContinuation = !line.isEmpty() && (line.charAt(0) == ' ' || line.charAt(0) == '\t'); 1464 if (isContinuation) { 1465 if (cur != null && curIsHeader) { 1466 cur.append(' ').append(line.trim()); 1467 } else { 1468 // Do not fold into a non-header; flush previous and start anew 1469 if (cur != null) out.add(cur.toString()); 1470 cur = new StringBuilder(line); 1471 curIsHeader = line.indexOf(':') > 0; // almost certainly false for leading-space lines 1472 } 1473 } else { 1474 if (cur != null) out.add(cur.toString()); 1475 cur = new StringBuilder(line); 1476 curIsHeader = line.indexOf(':') > 0; 1477 } 1478 } 1479 if (cur != null) out.add(cur.toString()); 1480 return out; 1481 } 1482 1483 /** 1484 * Header parsing helper: split on commas that are not inside a quoted-string; supports \" escapes inside quotes. 1485 */ 1486 @Nonnull 1487 private static List<String> splitCommaAware(@Nonnull String string) { 1488 requireNonNull(string); 1489 1490 List<String> out = new ArrayList<>(4); 1491 StringBuilder cur = new StringBuilder(); 1492 boolean inQuotes = false; 1493 boolean escaped = false; 1494 1495 for (int i = 0; i < string.length(); i++) { 1496 char c = string.charAt(i); 1497 1498 if (escaped) { 1499 // Preserve the escaped char as-is 1500 cur.append(c); 1501 escaped = false; 1502 } else if (c == '\\') { 1503 if (inQuotes) { 1504 // Preserve the backslash itself, then mark next char as escaped 1505 cur.append('\\'); // ← keep the backslash 1506 escaped = true; 1507 } else { 1508 cur.append('\\'); // literal backslash outside quotes 1509 } 1510 } else if (c == '"') { 1511 inQuotes = !inQuotes; 1512 cur.append('"'); 1513 } else if (c == ',' && !inQuotes) { 1514 out.add(cur.toString()); 1515 cur.setLength(0); 1516 } else { 1517 cur.append(c); 1518 } 1519 } 1520 out.add(cur.toString()); 1521 return out; 1522 } 1523 1524 /** 1525 * Remove a single pair of surrounding quotes if present. 1526 */ 1527 @Nonnull 1528 private static String stripOptionalQuotes(@Nonnull String string) { 1529 requireNonNull(string); 1530 1531 if (string.length() >= 2) { 1532 char first = string.charAt(0), last = string.charAt(string.length() - 1); 1533 1534 if ((first == '"' && last == '"') || (first == '\'' && last == '\'')) 1535 return string.substring(1, string.length() - 1); 1536 } 1537 1538 return string; 1539 } 1540 1541 /** 1542 * Parse host[:port] with IPv6 support: "[v6](:port)?" or "host(:port)?". 1543 * Returns host (with brackets for v6) and port (nullable). 1544 */ 1545 @ThreadSafe 1546 private static final class HostPort { 1547 @Nonnull 1548 private final String host; 1549 @Nullable 1550 private final Integer port; 1551 1552 HostPort(@Nonnull String host, 1553 @Nullable Integer port) { 1554 this.host = host; 1555 this.port = port; 1556 } 1557 1558 @Nonnull 1559 public String getHost() { 1560 return this.host; 1561 } 1562 1563 @Nonnull 1564 public Optional<Integer> getPort() { 1565 return Optional.ofNullable(this.port); 1566 } 1567 } 1568 1569 @Nonnull 1570 private static Optional<HostPort> parseHostPort(@Nullable String input) { 1571 input = trimAggressivelyToNull(input); 1572 1573 if (input == null) 1574 return Optional.empty(); 1575 1576 input = stripOptionalQuotes(input); 1577 1578 if (input.startsWith("[")) { 1579 int close = input.indexOf(']'); 1580 1581 if (close > 0) { 1582 String core = input.substring(1, close); // IPv6 literal without brackets 1583 String rest = input.substring(close + 1); // maybe ":port" 1584 String host = "[" + core + "]"; 1585 Integer port = null; 1586 1587 if (rest.startsWith(":")) { 1588 String ps = trimAggressivelyToNull(rest.substring(1)); 1589 if (ps != null) { 1590 try { 1591 port = Integer.parseInt(ps, 10); 1592 } catch (Exception ignored) { 1593 // Nothing to do 1594 } 1595 } 1596 } 1597 1598 return Optional.of(new HostPort(host, port)); 1599 } 1600 } 1601 1602 int colon = input.indexOf(':'); 1603 1604 if (colon > 0 && input.indexOf(':', colon + 1) == -1) { 1605 // exactly one ':' -> host:port (IPv4/hostname) 1606 String h = trimAggressivelyToNull(input.substring(0, colon)); 1607 String ps = trimAggressivelyToNull(input.substring(colon + 1)); 1608 Integer p = null; 1609 1610 if (ps != null) { 1611 try { 1612 p = Integer.parseInt(ps, 10); 1613 } catch (Exception ignored) { 1614 // Nothing to do 1615 } 1616 } 1617 if (h != null) 1618 return Optional.of(new HostPort(h, p)); 1619 } 1620 1621 // no port 1622 return Optional.of(new HostPort(input, null)); 1623 } 1624 1625 @Nonnull 1626 private static String removeDotSegments(@Nonnull String path) { 1627 requireNonNull(path); 1628 1629 Deque<String> stack = new ArrayDeque<>(); 1630 1631 for (String seg : path.split("/")) { 1632 if (seg.isEmpty() || ".".equals(seg)) 1633 continue; 1634 1635 if ("..".equals(seg)) { 1636 if (!stack.isEmpty()) 1637 stack.removeLast(); 1638 } else { 1639 stack.addLast(seg); 1640 } 1641 } 1642 1643 return "/" + String.join("/", stack); 1644 } 1645}