001/* 002 * Copyright 2022-2025 Revetware LLC. 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); 005 * you may not use this file except in compliance with the License. 006 * You may obtain a copy of the License at 007 * 008 * http://www.apache.org/licenses/LICENSE-2.0 009 * 010 * Unless required by applicable law or agreed to in writing, software 011 * distributed under the License is distributed on an "AS IS" BASIS, 012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 013 * See the License for the specific language governing permissions and 014 * limitations under the License. 015 * 016 * Some of the code below is sourced from the Apache Tomcat fork of Apache commons-fileupload. 017 * See https://github.com/apache/tomcat for the original. 018 * It is also licensed under the terms of the Apache License, Version 2.0. 019 */ 020 021package com.soklet.core.impl; 022 023import com.soklet.core.MultipartField; 024import com.soklet.core.MultipartParser; 025import com.soklet.core.Request; 026import com.soklet.core.Utilities; 027import com.soklet.exception.MissingRequestHeaderException; 028import com.soklet.internal.spring.LinkedCaseInsensitiveMap; 029 030import javax.annotation.Nonnull; 031import javax.annotation.concurrent.ThreadSafe; 032import java.io.ByteArrayInputStream; 033import java.io.ByteArrayOutputStream; 034import java.io.Closeable; 035import java.io.IOException; 036import java.io.InputStream; 037import java.io.OutputStream; 038import java.io.UncheckedIOException; 039import java.io.UnsupportedEncodingException; 040import java.nio.charset.Charset; 041import java.nio.charset.StandardCharsets; 042import java.text.ParseException; 043import java.util.Base64; 044import java.util.HashMap; 045import java.util.Iterator; 046import java.util.LinkedHashMap; 047import java.util.LinkedHashSet; 048import java.util.Locale; 049import java.util.Map; 050import java.util.Set; 051 052import static 
com.soklet.core.Utilities.trimAggressivelyToNull; 053 054/** 055 * @author <a href="https://www.revetkn.com">Mark Allen</a> 056 */ 057@ThreadSafe 058public class DefaultMultipartParser implements MultipartParser { 059 @Nonnull 060 private static final DefaultMultipartParser SHARED_INSTANCE; 061 062 static { 063 SHARED_INSTANCE = new DefaultMultipartParser(); 064 } 065 066 @Nonnull 067 public static DefaultMultipartParser sharedInstance() { 068 return SHARED_INSTANCE; 069 } 070 071 @Override 072 @Nonnull 073 public Map<String, Set<MultipartField>> extractMultipartFields(@Nonnull Request request) { 074 byte[] requestBody = request.getBody().orElse(null); 075 076 if (requestBody == null) 077 return Map.of(); 078 079 // Required for embedded commons-upload code 080 MultipartStream.ProgressNotifier progressNotifier = new MultipartStream.ProgressNotifier(new ProgressListener() { 081 @Override 082 public void update(long bytesRead, long contentLength, int items) { 083 // Ignored for now 084 } 085 }, requestBody.length) { 086 @Override 087 void noteBytesRead(int pBytes) { 088 // Ignored for now 089 } 090 091 @Override 092 public void noteItem() { 093 // Ignored for now 094 } 095 }; 096 097 String contentTypeHeader = request.getHeader("Content-Type").orElse(null); 098 099 if (contentTypeHeader == null) 100 throw new MissingRequestHeaderException("The 'Content-Type' header must be specified for multipart requests.", "Content-Type"); 101 102 Map<String, String> contentTypeHeaderFields = extractFields(contentTypeHeader); 103 Map<String, Set<MultipartField>> multipartFieldsByName = new LinkedHashMap<>(); 104 105 try (ByteArrayInputStream input = new ByteArrayInputStream(requestBody)) { 106 MultipartStream multipartStream = new MultipartStream(input, contentTypeHeaderFields.get("boundary").getBytes(), progressNotifier); 107 108 boolean hasNext = multipartStream.skipPreamble(); 109 110 while (hasNext) { 111 // Example headers: 112 // 113 // Content-Disposition: form-data; 
name="doc"; filename="test.pdf" 114 // Content-Type: application/pdf 115 // Use a case-insensitive map for simplified lookups 116 Map<String, String> headers = splitHeaders(multipartStream.readHeaders()); 117 String contentDisposition = trimAggressivelyToNull(headers.get("Content-Disposition")); 118 Map<String, String> contentDispositionFields = Map.of(); 119 120 if (contentDisposition != null) 121 contentDispositionFields = new ParameterParser().parse(contentDisposition, ';'); 122 123 String name = trimAggressivelyToNull(contentDispositionFields.get("name")); 124 125 if (name == null) 126 continue; 127 128 ByteArrayOutputStream data = new ByteArrayOutputStream(); 129 multipartStream.readBodyData(data); 130 131 String filename = trimAggressivelyToNull(contentDispositionFields.get("filename")); 132 133 // For example: 134 // "Screenshot-1.53.26 PM.png" 135 // becomes 136 // "Screenshot-1.53.26 PM.png" 137 if (filename != null) 138 filename = HTMLUtilities.unescapeHtml(filename); 139 140 String contentTypeHeaderValue = trimAggressivelyToNull(headers.get("Content-Type")); 141 String contentType = Utilities.extractContentTypeFromHeaderValue(contentTypeHeaderValue).orElse(null); 142 Charset charset = Utilities.extractCharsetFromHeaderValue(contentTypeHeaderValue).orElse(null); 143 144 MultipartField multipartField = MultipartField.with(name, data.toByteArray()) 145 .filename(filename) 146 .contentType(contentType) 147 .charset(charset) 148 .build(); 149 150 Set<MultipartField> multipartFields = multipartFieldsByName.get(name); 151 152 if (multipartFields == null) { 153 multipartFields = new LinkedHashSet<>(); 154 multipartFieldsByName.put(name, multipartFields); 155 } 156 157 multipartFields.add(multipartField); 158 159 hasNext = multipartStream.readBoundary(); 160 } 161 } catch (IOException e) { 162 throw new UncheckedIOException(e); 163 } 164 165 return multipartFieldsByName; 166 } 167 168 // The code below is sourced from Selenium. 
169 // It is licensed under the terms of the Apache License, Version 2.0. 170 // The license text for all of the below code is as follows: 171 172 /* 173 Copyright 2012 Selenium committers 174 Copyright 2012 Software Freedom Conservancy 175 176 Licensed under the Apache License, Version 2.0 (the "License"); 177 you may not use this file except in compliance with the License. 178 You may obtain a copy of the License at 179 180 http://www.apache.org/licenses/LICENSE-2.0 181 182 Unless required by applicable law or agreed to in writing, software 183 distributed under the License is distributed on an "AS IS" BASIS, 184 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 185 See the License for the specific language governing permissions and 186 limitations under the License. 187 */ 188 189 // *** START Selenium UploadFileHandler source *** 190 191 protected LinkedCaseInsensitiveMap<String> splitHeaders(String readHeaders) { 192 LinkedCaseInsensitiveMap<String> headersBuilder = new LinkedCaseInsensitiveMap<>(); 193 String[] headers = readHeaders.split("\r\n"); 194 for (String headerLine : headers) { 195 int index = headerLine.indexOf(':'); 196 if (index < 0) { 197 continue; 198 } 199 String key = headerLine.substring(0, index); 200 String value = headerLine.substring(index + 1).trim(); 201 headersBuilder.put(key, value); 202 } 203 return headersBuilder; 204 } 205 206 protected LinkedCaseInsensitiveMap<String> extractFields(String contentTypeHeader) { 207 LinkedCaseInsensitiveMap<String> fieldsBuilder = new LinkedCaseInsensitiveMap<>(); 208 String[] contentTypeHeaderParts = contentTypeHeader.split("[;,]"); 209 for (String contentTypeHeaderPart : contentTypeHeaderParts) { 210 String[] kv = contentTypeHeaderPart.split("="); 211 if (kv.length == 2) { 212 fieldsBuilder.put(kv[0].trim().toLowerCase(Locale.US), kv[1].trim()); 213 } 214 } 215 return fieldsBuilder; 216 } 217 218 // *** END Selenium UploadFileHandler source *** 219 220 // The code below is 
sourced from the Apache Tomcat fork of Apache commons-fileupload. 221 // See https://github.com/apache/tomcat for the original. 222 // It is licensed under the terms of the Apache License, Version 2.0. 223 // The license text for all of the below code is as follows: 224 225 /* 226 * Licensed to the Apache Software Foundation (ASF) under one or more 227 * contributor license agreements. See the NOTICE file distributed with 228 * this work for additional information regarding copyright ownership. 229 * The ASF licenses this file to You under the Apache License, Version 2.0 230 * (the "License"); you may not use this file except in compliance with 231 * the License. You may obtain a copy of the License at 232 * 233 * http://www.apache.org/licenses/LICENSE-2.0 234 * 235 * Unless required by applicable law or agreed to in writing, software 236 * distributed under the License is distributed on an "AS IS" BASIS, 237 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 238 * See the License for the specific language governing permissions and 239 * limitations under the License. 240 */ 241 242 // *** START commons-fileupload source *** 243 244 /** 245 * Receives progress information. May be used to display a progress bar. 246 */ 247 @FunctionalInterface 248 protected interface ProgressListener { 249 250 /** 251 * Nop implementation. 252 */ 253 ProgressListener NOP = (bytesRead, contentLength, items) -> { 254 // nop 255 }; 256 257 /** 258 * Updates the listeners status information. 259 * 260 * @param bytesRead The total number of bytes, which have been read so far. 261 * @param contentLength The total number of bytes, which are being read. May be -1, if this number is unknown. 262 * @param items The number of the field, which is currently being read. (0 = no item so far, 1 = first item is being read, ...) 
263 */ 264 void update(long bytesRead, long contentLength, int items); 265 266 } 267 268 /** 269 * Exception for errors encountered while processing the request. 270 */ 271 protected static class FileUploadException extends IOException { 272 273 private static final long serialVersionUID = -4222909057964038517L; 274 275 /** 276 * Constructs a new {@code FileUploadException} without message. 277 */ 278 public FileUploadException() { 279 super(); 280 } 281 282 /** 283 * Constructs a new {@code FileUploadException} with specified detail 284 * message. 285 * 286 * @param msg the error message. 287 */ 288 public FileUploadException(final String msg) { 289 super(msg); 290 } 291 292 /** 293 * Creates a new {@code FileUploadException} with the given 294 * detail message and cause. 295 * 296 * @param msg The exceptions detail message. 297 * @param cause The exceptions cause. 298 */ 299 public FileUploadException(final String msg, final Throwable cause) { 300 super(msg, cause); 301 } 302 } 303 304 /** 305 * This exception is thrown for hiding an inner 306 * {@link FileUploadException} in an {@link IOException}. 307 */ 308 protected static class FileUploadIOException extends IOException { 309 310 /** 311 * The exceptions UID, for serializing an instance. 312 */ 313 private static final long serialVersionUID = -7047616958165584154L; 314 315 /** 316 * The exceptions cause; we overwrite the parent 317 * classes field, which is available since Java 318 * 1.4 only. 319 */ 320 private final FileUploadException cause; 321 322 /** 323 * Creates a {@code FileUploadIOException} with the 324 * given cause. 325 * 326 * @param pCause The exceptions cause, if any, or null. 327 */ 328 public FileUploadIOException(final FileUploadException pCause) { 329 // We're not doing super(pCause) cause of 1.3 compatibility. 330 cause = pCause; 331 } 332 333 /** 334 * Returns the exceptions cause. 335 * 336 * @return The exceptions cause, if any, or null. 
337 */ 338 @SuppressWarnings("sync-override") // Field is final 339 @Override 340 public Throwable getCause() { 341 return cause; 342 } 343 344 } 345 346 /** 347 * <p> This class provides support for accessing the headers for a file or form 348 * item that was received within a {@code multipart/form-data} POST 349 * request.</p> 350 * 351 * @since 1.2.1 352 */ 353 protected interface FileItemHeaders { 354 355 /** 356 * Returns the value of the specified part header as a {@code String}. 357 * <p> 358 * If the part did not include a header of the specified name, this method 359 * return {@code null}. If there are multiple headers with the same 360 * name, this method returns the first header in the item. The header 361 * name is case insensitive. 362 * 363 * @param name a {@code String} specifying the header name 364 * @return a {@code String} containing the value of the requested 365 * header, or {@code null} if the item does not have a header 366 * of that name 367 */ 368 String getHeader(String name); 369 370 /** 371 * <p> 372 * Returns all the values of the specified item header as an 373 * {@code Iterator} of {@code String} objects. 374 * </p> 375 * <p> 376 * If the item did not include any headers of the specified name, this 377 * method returns an empty {@code Iterator}. The header name is 378 * case insensitive. 379 * </p> 380 * 381 * @param name a {@code String} specifying the header name 382 * @return an {@code Iterator} containing the values of the 383 * requested header. If the item does not have any headers of 384 * that name, return an empty {@code Iterator} 385 */ 386 Iterator<String> getHeaders(String name); 387 388 /** 389 * <p> 390 * Returns an {@code Iterator} of all the header names. 391 * </p> 392 * 393 * @return an {@code Iterator} containing all of the names of 394 * headers provided with this file item. 
If the item does not have 395 * any headers return an empty {@code Iterator} 396 */ 397 Iterator<String> getHeaderNames(); 398 399 } 400 401 /** 402 * Interface that will indicate that FileItem or FileItemStream 403 * implementations will accept the headers read for the item. 404 * 405 * @see FileItemStream 406 * @since 1.2.1 407 */ 408 protected interface FileItemHeadersSupport { 409 410 /** 411 * Returns the collection of headers defined locally within this item. 412 * 413 * @return the {@link FileItemHeaders} present for this item. 414 */ 415 FileItemHeaders getHeaders(); 416 417 /** 418 * Sets the headers read from within an item. Implementations of 419 * FileItem or FileItemStream should implement this 420 * interface to be able to get the raw headers found within the item 421 * header block. 422 * 423 * @param headers the instance that holds onto the headers 424 * for this instance. 425 */ 426 void setHeaders(FileItemHeaders headers); 427 428 } 429 430 /** 431 * <p> This interface provides access to a file or form item that was 432 * received within a {@code multipart/form-data} POST request. 
433 * The items contents are retrieved by calling {@link #openStream()}.</p> 434 * <p>Instances of this class are created by accessing the 435 * iterator, returned by 436 * FileUploadBase#getItemIterator(RequestContext).</p> 437 * <p><em>Note</em>: There is an interaction between the iterator and 438 * its associated instances of {@link FileItemStream}: By invoking 439 * {@link java.util.Iterator#hasNext()} on the iterator, you discard all data, 440 * which hasn't been read so far from the previous data.</p> 441 */ 442 protected interface FileItemStream extends FileItemHeadersSupport { 443 444 /** 445 * This exception is thrown, if an attempt is made to read 446 * data from the {@link InputStream}, which has been returned 447 * by {@link FileItemStream#openStream()}, after 448 * {@link java.util.Iterator#hasNext()} has been invoked on the 449 * iterator, which created the {@link FileItemStream}. 450 */ 451 class ItemSkippedException extends IOException { 452 453 /** 454 * The exceptions serial version UID, which is being used 455 * when serializing an exception instance. 456 */ 457 private static final long serialVersionUID = -7280778431581963740L; 458 459 } 460 461 /** 462 * Creates an {@link InputStream}, which allows to read the 463 * items contents. 464 * 465 * @return The input stream, from which the items data may 466 * be read. 467 * @throws IllegalStateException The method was already invoked on 468 * this item. It is not possible to recreate the data stream. 469 * @throws IOException An I/O error occurred. 470 * @see ItemSkippedException 471 */ 472 InputStream openStream() throws IOException; 473 474 /** 475 * Returns the content type passed by the browser or {@code null} if 476 * not defined. 477 * 478 * @return The content type passed by the browser or {@code null} if 479 * not defined. 
480 */ 481 String getContentType(); 482 483 /** 484 * Returns the original file name in the client's file system, as provided by 485 * the browser (or other client software). In most cases, this will be the 486 * base file name, without path information. However, some clients, such as 487 * the Opera browser, do include path information. 488 * 489 * @return The original file name in the client's file system. 490 */ 491 String getName(); 492 493 /** 494 * Returns the name of the field in the multipart form corresponding to 495 * this file item. 496 * 497 * @return The name of the form field. 498 */ 499 String getFieldName(); 500 501 /** 502 * Determines whether or not a {@code FileItem} instance represents 503 * a simple form field. 504 * 505 * @return {@code true} if the instance represents a simple form 506 * field; {@code false} if it represents an uploaded file. 507 */ 508 boolean isFormField(); 509 510 } 511 512 /** 513 * This exception is thrown in case of an invalid file name. 514 * A file name is invalid, if it contains a NUL character. 515 * Attackers might use this to circumvent security checks: 516 * For example, a malicious user might upload a file with the name 517 * "foo.exe\0.png". This file name might pass security checks (i.e. 518 * checks for the extension ".png"), while, depending on the underlying 519 * C library, it might create a file named "foo.exe", as the NUL 520 * character is the string terminator in C. 521 */ 522 protected static class InvalidFileNameException extends RuntimeException { 523 524 /** 525 * Serial version UID, being used, if the exception 526 * is serialized. 527 */ 528 private static final long serialVersionUID = 7922042602454350470L; 529 530 /** 531 * The file name causing the exception. 532 */ 533 private final String name; 534 535 /** 536 * Creates a new instance. 537 * 538 * @param pName The file name causing the exception. 539 * @param pMessage A human readable error message. 
540 */ 541 public InvalidFileNameException(final String pName, final String pMessage) { 542 super(pMessage); 543 name = pName; 544 } 545 546 /** 547 * Returns the invalid file name. 548 * 549 * @return the invalid file name. 550 */ 551 public String getName() { 552 return name; 553 } 554 555 } 556 557 /** 558 * Utility class for working with streams. 559 */ 560 protected static final class Streams { 561 562 /** 563 * Private constructor, to prevent instantiation. 564 * This class has only static methods. 565 */ 566 private Streams() { 567 // Does nothing 568 } 569 570 /** 571 * Default buffer size for use in 572 * {@link #copy(InputStream, OutputStream, boolean)}. 573 */ 574 public static final int DEFAULT_BUFFER_SIZE = 8192; 575 576 /** 577 * Copies the contents of the given {@link InputStream} 578 * to the given {@link OutputStream}. Shortcut for 579 * <pre> 580 * copy(pInputStream, pOutputStream, new byte[8192]); 581 * </pre> 582 * 583 * @param inputStream The input stream, which is being read. 584 * It is guaranteed, that {@link InputStream#close()} is called 585 * on the stream. 586 * @param outputStream The output stream, to which data should 587 * be written. May be null, in which case the input streams 588 * contents are simply discarded. 589 * @param closeOutputStream True guarantees, that {@link OutputStream#close()} 590 * is called on the stream. False indicates, that only 591 * {@link OutputStream#flush()} should be called finally. 592 * @return Number of bytes, which have been copied. 593 * @throws IOException An I/O error occurred. 594 */ 595 public static long copy(final InputStream inputStream, final OutputStream outputStream, 596 final boolean closeOutputStream) 597 throws IOException { 598 return copy(inputStream, outputStream, closeOutputStream, new byte[DEFAULT_BUFFER_SIZE]); 599 } 600 601 /** 602 * Copies the contents of the given {@link InputStream} 603 * to the given {@link OutputStream}. 
604 * 605 * @param inputStream The input stream, which is being read. 606 * It is guaranteed, that {@link InputStream#close()} is called 607 * on the stream. 608 * @param outputStream The output stream, to which data should 609 * be written. May be null, in which case the input streams 610 * contents are simply discarded. 611 * @param closeOutputStream True guarantees, that {@link OutputStream#close()} 612 * is called on the stream. False indicates, that only 613 * {@link OutputStream#flush()} should be called finally. 614 * @param buffer Temporary buffer, which is to be used for 615 * copying data. 616 * @return Number of bytes, which have been copied. 617 * @throws IOException An I/O error occurred. 618 */ 619 public static long copy(final InputStream inputStream, 620 final OutputStream outputStream, final boolean closeOutputStream, 621 final byte[] buffer) 622 throws IOException { 623 try (OutputStream out = outputStream; 624 InputStream in = inputStream) { 625 long total = 0; 626 for (; ; ) { 627 final int res = in.read(buffer); 628 if (res == -1) { 629 break; 630 } 631 if (res > 0) { 632 total += res; 633 if (out != null) { 634 out.write(buffer, 0, res); 635 } 636 } 637 } 638 if (out != null) { 639 if (closeOutputStream) { 640 out.close(); 641 } else { 642 out.flush(); 643 } 644 } 645 in.close(); 646 return total; 647 } 648 } 649 650 /** 651 * Checks, whether the given file name is valid in the sense, 652 * that it doesn't contain any NUL characters. If the file name 653 * is valid, it will be returned without any modifications. Otherwise, 654 * an {@link InvalidFileNameException} is raised. 655 * 656 * @param fileName The file name to check 657 * @return Unmodified file name, if valid. 658 * @throws InvalidFileNameException The file name was found to be invalid. 
659 */ 660 public static String checkFileName(final String fileName) { 661 if (fileName != null && fileName.indexOf('\u0000') != -1) { 662 // pFileName.replace("\u0000", "\\0") 663 final StringBuilder sb = new StringBuilder(); 664 for (int i = 0; i < fileName.length(); i++) { 665 final char c = fileName.charAt(i); 666 switch (c) { 667 case 0: 668 sb.append("\\0"); 669 break; 670 default: 671 sb.append(c); 672 break; 673 } 674 } 675 throw new InvalidFileNameException(fileName, 676 "Invalid file name: " + sb); 677 } 678 return fileName; 679 } 680 681 } 682 683 /** 684 * <p> Low level API for processing file uploads. 685 * 686 * <p> This class can be used to process data streams conforming to MIME 687 * 'multipart' format as defined in 688 * <a href="http://www.ietf.org/rfc/rfc1867.txt">RFC 1867</a>. Arbitrarily 689 * large amounts of data in the stream can be processed under constant 690 * memory usage. 691 * 692 * <p> The format of the stream is defined in the following way:<br> 693 * 694 * <code> 695 * multipart-body := preamble 1*encapsulation close-delimiter epilogue<br> 696 * encapsulation := delimiter body CRLF<br> 697 * delimiter := "--" boundary CRLF<br> 698 * close-delimiter := "--" boundary "--"<br> 699 * preamble := <ignore><br> 700 * epilogue := <ignore><br> 701 * body := header-part CRLF body-part<br> 702 * header-part := 1*header CRLF<br> 703 * header := header-name ":" header-value<br> 704 * header-name := <printable ascii characters except ":"><br> 705 * header-value := <any ascii characters except CR & LF><br> 706 * body-data := <arbitrary data><br> 707 * </code> 708 * 709 * <p>Note that body-data can contain another mulipart entity. There 710 * is limited support for single pass processing of such nested 711 * streams. The nested stream is <strong>required</strong> to have a 712 * boundary token of the same length as the parent stream (see {@link 713 * #setBoundary(byte[])}). 
714 * 715 * <p>Here is an example of usage of this class.<br> 716 * 717 * <pre> 718 * try { 719 * MultipartStream multipartStream = new MultipartStream(input, boundary); 720 * boolean nextPart = multipartStream.skipPreamble(); 721 * OutputStream output; 722 * while(nextPart) { 723 * String header = multipartStream.readHeaders(); 724 * // process headers 725 * // create some output stream 726 * multipartStream.readBodyData(output); 727 * nextPart = multipartStream.readBoundary(); 728 * } 729 * } catch(MultipartStream.MalformedStreamException e) { 730 * // the stream failed to follow required syntax 731 * } catch(IOException e) { 732 * // a read or write error occurred 733 * } 734 * </pre> 735 */ 736 protected static class MultipartStream { 737 /** 738 * Internal class, which is used to invoke the 739 * {@link ProgressListener}. 740 */ 741 public static class ProgressNotifier { 742 743 /** 744 * The listener to invoke. 745 */ 746 private final ProgressListener listener; 747 748 /** 749 * Number of expected bytes, if known, or -1. 750 */ 751 private final long contentLength; 752 753 /** 754 * Number of bytes, which have been read so far. 755 */ 756 private long bytesRead; 757 758 /** 759 * Number of items, which have been read so far. 760 */ 761 private int items; 762 763 /** 764 * Creates a new instance with the given listener 765 * and content length. 766 * 767 * @param pListener The listener to invoke. 768 * @param pContentLength The expected content length. 769 */ 770 public ProgressNotifier(final ProgressListener pListener, final long pContentLength) { 771 listener = pListener; 772 contentLength = pContentLength; 773 } 774 775 /** 776 * Called to indicate that bytes have been read. 777 * 778 * @param pBytes Number of bytes, which have been read. 779 */ 780 void noteBytesRead(final int pBytes) { 781 /* Indicates, that the given number of bytes have been read from 782 * the input stream. 
783 */ 784 bytesRead += pBytes; 785 notifyListener(); 786 } 787 788 /** 789 * Called to indicate, that a new file item has been detected. 790 */ 791 public void noteItem() { 792 ++items; 793 notifyListener(); 794 } 795 796 /** 797 * Called for notifying the listener. 798 */ 799 private void notifyListener() { 800 if (listener != null) { 801 listener.update(bytesRead, contentLength, items); 802 } 803 } 804 805 } 806 807 // ----------------------------------------------------- Manifest constants 808 809 /** 810 * The Carriage Return ASCII character value. 811 */ 812 public static final byte CR = 0x0D; 813 814 /** 815 * The Line Feed ASCII character value. 816 */ 817 public static final byte LF = 0x0A; 818 819 /** 820 * The dash (-) ASCII character value. 821 */ 822 public static final byte DASH = 0x2D; 823 824 /** 825 * The maximum length of {@code header-part} that will be 826 * processed (10 kilobytes = 10240 bytes.). 827 */ 828 public static final int HEADER_PART_SIZE_MAX = 10240; 829 830 /** 831 * The default length of the buffer used for processing a request. 832 */ 833 protected static final int DEFAULT_BUFSIZE = 4096; 834 835 /** 836 * A byte sequence that marks the end of {@code header-part} 837 * ({@code CRLFCRLF}). 838 */ 839 protected static final byte[] HEADER_SEPARATOR = {CR, LF, CR, LF}; 840 841 /** 842 * A byte sequence that that follows a delimiter that will be 843 * followed by an encapsulation ({@code CRLF}). 844 */ 845 protected static final byte[] FIELD_SEPARATOR = {CR, LF}; 846 847 /** 848 * A byte sequence that that follows a delimiter of the last 849 * encapsulation in the stream ({@code --}). 850 */ 851 protected static final byte[] STREAM_TERMINATOR = {DASH, DASH}; 852 853 /** 854 * A byte sequence that precedes a boundary ({@code CRLF--}). 
855 */ 856 protected static final byte[] BOUNDARY_PREFIX = {CR, LF, DASH, DASH}; 857 858 // ----------------------------------------------------------- Data members 859 860 /** 861 * The input stream from which data is read. 862 */ 863 private final InputStream input; 864 865 /** 866 * The length of the boundary token plus the leading {@code CRLF--}. 867 */ 868 private int boundaryLength; 869 870 /** 871 * The amount of data, in bytes, that must be kept in the buffer in order 872 * to detect delimiters reliably. 873 */ 874 private final int keepRegion; 875 876 /** 877 * The byte sequence that partitions the stream. 878 */ 879 private final byte[] boundary; 880 881 /** 882 * The table for Knuth-Morris-Pratt search algorithm. 883 */ 884 private final int[] boundaryTable; 885 886 /** 887 * The length of the buffer used for processing the request. 888 */ 889 private final int bufSize; 890 891 /** 892 * The buffer used for processing the request. 893 */ 894 private final byte[] buffer; 895 896 /** 897 * The index of first valid character in the buffer. 898 * <br> 899 * 0 <= head < bufSize 900 */ 901 private int head; 902 903 /** 904 * The index of last valid character in the buffer + 1. 905 * <br> 906 * 0 <= tail <= bufSize 907 */ 908 private int tail; 909 910 /** 911 * The content encoding to use when reading headers. 912 */ 913 private String headerEncoding; 914 915 /** 916 * The progress notifier, if any, or null. 917 */ 918 private final ProgressNotifier notifier; 919 920 // ----------------------------------------------------------- Constructors 921 922 /** 923 * <p> Constructs a {@code MultipartStream} with a custom size buffer. 924 * 925 * <p> Note that the buffer must be at least big enough to contain the 926 * boundary string, plus 4 characters for CR/LF and double dash, plus at 927 * least one byte of data. Too small a buffer size setting will degrade 928 * performance. 929 * 930 * @param input The {@code InputStream} to serve as a data source. 
931 * @param boundary The token used for dividing the stream into 932 * {@code encapsulations}. 933 * @param bufSize The size of the buffer to be used, in bytes. 934 * @param pNotifier The notifier, which is used for calling the 935 * progress listener, if any. 936 * @throws IllegalArgumentException If the buffer size is too small 937 * @since 1.3.1 938 */ 939 public MultipartStream(final InputStream input, 940 final byte[] boundary, 941 final int bufSize, 942 final ProgressNotifier pNotifier) { 943 944 if (boundary == null) { 945 throw new IllegalArgumentException("boundary may not be null"); 946 } 947 // We prepend CR/LF to the boundary to chop trailing CR/LF from 948 // body-data tokens. 949 this.boundaryLength = boundary.length + BOUNDARY_PREFIX.length; 950 if (bufSize < this.boundaryLength + 1) { 951 throw new IllegalArgumentException( 952 "The buffer size specified for the MultipartStream is too small"); 953 } 954 955 this.input = input; 956 this.bufSize = Math.max(bufSize, boundaryLength * 2); 957 this.buffer = new byte[this.bufSize]; 958 this.notifier = pNotifier; 959 960 this.boundary = new byte[this.boundaryLength]; 961 this.boundaryTable = new int[this.boundaryLength + 1]; 962 this.keepRegion = this.boundary.length; 963 964 System.arraycopy(BOUNDARY_PREFIX, 0, this.boundary, 0, 965 BOUNDARY_PREFIX.length); 966 System.arraycopy(boundary, 0, this.boundary, BOUNDARY_PREFIX.length, 967 boundary.length); 968 computeBoundaryTable(); 969 970 head = 0; 971 tail = 0; 972 } 973 974 /** 975 * <p> Constructs a {@code MultipartStream} with a default size buffer. 976 * 977 * @param input The {@code InputStream} to serve as a data source. 978 * @param boundary The token used for dividing the stream into 979 * {@code encapsulations}. 980 * @param pNotifier An object for calling the progress listener, if any. 
981 * @see #MultipartStream(InputStream, byte[], int, ProgressNotifier) 982 */ 983 public MultipartStream(final InputStream input, 984 final byte[] boundary, 985 final ProgressNotifier pNotifier) { 986 this(input, boundary, DEFAULT_BUFSIZE, pNotifier); 987 } 988 989 // --------------------------------------------------------- Public methods 990 991 /** 992 * Retrieves the character encoding used when reading the headers of an 993 * individual part. When not specified, or {@code null}, the platform 994 * default encoding is used. 995 * 996 * @return The encoding used to read part headers. 997 */ 998 public String getHeaderEncoding() { 999 return headerEncoding; 1000 } 1001 1002 /** 1003 * Specifies the character encoding to be used when reading the headers of 1004 * individual parts. When not specified, or {@code null}, the platform 1005 * default encoding is used. 1006 * 1007 * @param encoding The encoding used to read part headers. 1008 */ 1009 public void setHeaderEncoding(final String encoding) { 1010 headerEncoding = encoding; 1011 } 1012 1013 /** 1014 * Reads a byte from the {@code buffer}, and refills it as 1015 * necessary. 1016 * 1017 * @return The next byte from the input stream. 1018 * @throws IOException if there is no more data available. 1019 */ 1020 public byte readByte() throws IOException { 1021 // Buffer depleted ? 1022 if (head == tail) { 1023 head = 0; 1024 // Refill. 1025 tail = input.read(buffer, head, bufSize); 1026 if (tail == -1) { 1027 // No more data available. 1028 throw new IOException("No more data is available"); 1029 } 1030 if (notifier != null) { 1031 notifier.noteBytesRead(tail); 1032 } 1033 } 1034 return buffer[head++]; 1035 } 1036 1037 /** 1038 * Skips a {@code boundary} token, and checks whether more 1039 * {@code encapsulations} are contained in the stream. 1040 * 1041 * @return {@code true} if there are more encapsulations in 1042 * this stream; {@code false} otherwise. 
1043 * @throws FileUploadIOException if the bytes read from the stream exceeded the size limits 1044 * @throws MalformedStreamException if the stream ends unexpectedly or 1045 * fails to follow required syntax. 1046 */ 1047 public boolean readBoundary() 1048 throws FileUploadIOException, MalformedStreamException { 1049 final byte[] marker = new byte[2]; 1050 final boolean nextChunk; 1051 1052 head += boundaryLength; 1053 try { 1054 marker[0] = readByte(); 1055 if (marker[0] == LF) { 1056 // Work around IE5 Mac bug with input type=image. 1057 // Because the boundary delimiter, not including the trailing 1058 // CRLF, must not appear within any file (RFC 2046, section 1059 // 5.1.1), we know the missing CR is due to a buggy browser 1060 // rather than a file containing something similar to a 1061 // boundary. 1062 return true; 1063 } 1064 1065 marker[1] = readByte(); 1066 if (arrayequals(marker, STREAM_TERMINATOR, 2)) { 1067 nextChunk = false; 1068 } else if (arrayequals(marker, FIELD_SEPARATOR, 2)) { 1069 nextChunk = true; 1070 } else { 1071 throw new MalformedStreamException( 1072 "Unexpected characters follow a boundary"); 1073 } 1074 } catch (final FileUploadIOException e) { 1075 // wraps a SizeException, re-throw as it will be unwrapped later 1076 throw e; 1077 } catch (final IOException e) { 1078 throw new MalformedStreamException("Stream ended unexpectedly"); 1079 } 1080 return nextChunk; 1081 } 1082 1083 /** 1084 * <p>Changes the boundary token used for partitioning the stream. 1085 * 1086 * <p>This method allows single pass processing of nested multipart 1087 * streams. 1088 * 1089 * <p>The boundary token of the nested stream is {@code required} 1090 * to be of the same length as the boundary token in parent stream. 1091 * 1092 * <p>Restoring the parent stream boundary token after processing of a 1093 * nested stream is left to the application. 1094 * 1095 * @param boundary The boundary to be used for parsing of the nested 1096 * stream. 
1097 * @throws IllegalBoundaryException if the {@code boundary} 1098 * has a different length than the one 1099 * being currently parsed. 1100 */ 1101 public void setBoundary(final byte[] boundary) 1102 throws IllegalBoundaryException { 1103 if (boundary.length != boundaryLength - BOUNDARY_PREFIX.length) { 1104 throw new IllegalBoundaryException( 1105 "The length of a boundary token cannot be changed"); 1106 } 1107 System.arraycopy(boundary, 0, this.boundary, BOUNDARY_PREFIX.length, 1108 boundary.length); 1109 computeBoundaryTable(); 1110 } 1111 1112 /** 1113 * Compute the table used for Knuth-Morris-Pratt search algorithm. 1114 */ 1115 private void computeBoundaryTable() { 1116 int position = 2; 1117 int candidate = 0; 1118 1119 boundaryTable[0] = -1; 1120 boundaryTable[1] = 0; 1121 1122 while (position <= boundaryLength) { 1123 if (boundary[position - 1] == boundary[candidate]) { 1124 boundaryTable[position] = candidate + 1; 1125 candidate++; 1126 position++; 1127 } else if (candidate > 0) { 1128 candidate = boundaryTable[candidate]; 1129 } else { 1130 boundaryTable[position] = 0; 1131 position++; 1132 } 1133 } 1134 } 1135 1136 /** 1137 * <p>Reads the {@code header-part} of the current 1138 * {@code encapsulation}. 1139 * 1140 * <p>Headers are returned verbatim to the input stream, including the 1141 * trailing {@code CRLF} marker. Parsing is left to the 1142 * application. 1143 * 1144 * <p><strong>TODO</strong> allow limiting maximum header size to 1145 * protect against abuse. 1146 * 1147 * @return The {@code header-part} of the current encapsulation. 1148 * @throws FileUploadIOException if the bytes read from the stream exceeded the size limits. 1149 * @throws MalformedStreamException if the stream ends unexpectedly. 
1150 */ 1151 public String readHeaders() throws FileUploadIOException, MalformedStreamException { 1152 int i = 0; 1153 byte b; 1154 // to support multi-byte characters 1155 final ByteArrayOutputStream baos = new ByteArrayOutputStream(); 1156 int size = 0; 1157 while (i < HEADER_SEPARATOR.length) { 1158 try { 1159 b = readByte(); 1160 } catch (final FileUploadIOException e) { 1161 // wraps a SizeException, re-throw as it will be unwrapped later 1162 throw e; 1163 } catch (final IOException e) { 1164 throw new MalformedStreamException("Stream ended unexpectedly"); 1165 } 1166 if (++size > HEADER_PART_SIZE_MAX) { 1167 throw new MalformedStreamException(String.format( 1168 "Header section has more than %s bytes (maybe it is not properly terminated)", 1169 Integer.valueOf(HEADER_PART_SIZE_MAX))); 1170 } 1171 if (b == HEADER_SEPARATOR[i]) { 1172 i++; 1173 } else { 1174 i = 0; 1175 } 1176 baos.write(b); 1177 } 1178 1179 String headers; 1180 if (headerEncoding != null) { 1181 try { 1182 headers = baos.toString(headerEncoding); 1183 } catch (final UnsupportedEncodingException e) { 1184 // Fall back to platform default if specified encoding is not 1185 // supported. 1186 headers = baos.toString(); 1187 } 1188 } else { 1189 headers = baos.toString(); 1190 } 1191 1192 return headers; 1193 } 1194 1195 /** 1196 * <p>Reads {@code body-data} from the current 1197 * {@code encapsulation} and writes its contents into the 1198 * output {@code Stream}. 1199 * 1200 * <p>Arbitrary large amounts of data can be processed by this 1201 * method using a constant size buffer. (see {@link 1202 * #MultipartStream(InputStream, byte[], int, 1203 * MultipartStream.ProgressNotifier) constructor}). 1204 * 1205 * @param output The {@code Stream} to write data into. May 1206 * be null, in which case this method is equivalent 1207 * to {@link #discardBodyData()}. 1208 * @return the amount of data written. 1209 * @throws MalformedStreamException if the stream ends unexpectedly. 
1210 * @throws IOException if an i/o error occurs. 1211 */ 1212 public int readBodyData(final OutputStream output) 1213 throws MalformedStreamException, IOException { 1214 return (int) Streams.copy(newInputStream(), output, false); // N.B. Streams.copy closes the input stream 1215 } 1216 1217 /** 1218 * Creates a new {@link ItemInputStream}. 1219 * 1220 * @return A new instance of {@link ItemInputStream}. 1221 */ 1222 public ItemInputStream newInputStream() { 1223 return new ItemInputStream(); 1224 } 1225 1226 /** 1227 * <p> Reads {@code body-data} from the current 1228 * {@code encapsulation} and discards it. 1229 * 1230 * <p>Use this method to skip encapsulations you don't need or don't 1231 * understand. 1232 * 1233 * @return The amount of data discarded. 1234 * @throws MalformedStreamException if the stream ends unexpectedly. 1235 * @throws IOException if an i/o error occurs. 1236 */ 1237 public int discardBodyData() throws MalformedStreamException, IOException { 1238 return readBodyData(null); 1239 } 1240 1241 /** 1242 * Finds the beginning of the first {@code encapsulation}. 1243 * 1244 * @return {@code true} if an {@code encapsulation} was found in 1245 * the stream. 1246 * @throws IOException if an i/o error occurs. 1247 */ 1248 public boolean skipPreamble() throws IOException { 1249 // First delimiter may be not preceded with a CRLF. 1250 System.arraycopy(boundary, 2, boundary, 0, boundary.length - 2); 1251 boundaryLength = boundary.length - 2; 1252 computeBoundaryTable(); 1253 try { 1254 // Discard all data up to the delimiter. 1255 discardBodyData(); 1256 1257 // Read boundary - if succeeded, the stream contains an 1258 // encapsulation. 1259 return readBoundary(); 1260 } catch (final MalformedStreamException e) { 1261 return false; 1262 } finally { 1263 // Restore delimiter. 
1264 System.arraycopy(boundary, 0, boundary, 2, boundary.length - 2); 1265 boundaryLength = boundary.length; 1266 boundary[0] = CR; 1267 boundary[1] = LF; 1268 computeBoundaryTable(); 1269 } 1270 } 1271 1272 /** 1273 * Compares {@code count} first bytes in the arrays 1274 * {@code a} and {@code b}. 1275 * 1276 * @param a The first array to compare. 1277 * @param b The second array to compare. 1278 * @param count How many bytes should be compared. 1279 * @return {@code true} if {@code count} first bytes in arrays 1280 * {@code a} and {@code b} are equal. 1281 */ 1282 public static boolean arrayequals(final byte[] a, 1283 final byte[] b, 1284 final int count) { 1285 for (int i = 0; i < count; i++) { 1286 if (a[i] != b[i]) { 1287 return false; 1288 } 1289 } 1290 return true; 1291 } 1292 1293 /** 1294 * Searches for the {@code boundary} in the {@code buffer} 1295 * region delimited by {@code head} and {@code tail}. 1296 * 1297 * @return The position of the boundary found, counting from the 1298 * beginning of the {@code buffer}, or {@code -1} if 1299 * not found. 1300 */ 1301 protected int findSeparator() { 1302 1303 int bufferPos = this.head; 1304 int tablePos = 0; 1305 1306 while (bufferPos < this.tail) { 1307 while (tablePos >= 0 && buffer[bufferPos] != boundary[tablePos]) { 1308 tablePos = boundaryTable[tablePos]; 1309 } 1310 bufferPos++; 1311 tablePos++; 1312 if (tablePos == boundaryLength) { 1313 return bufferPos - boundaryLength; 1314 } 1315 } 1316 return -1; 1317 } 1318 1319 /** 1320 * Thrown to indicate that the input stream fails to follow the 1321 * required syntax. 1322 */ 1323 public static class MalformedStreamException extends IOException { 1324 1325 /** 1326 * The UID to use when serializing this instance. 1327 */ 1328 private static final long serialVersionUID = 6466926458059796677L; 1329 1330 /** 1331 * Constructs a {@code MalformedStreamException} with no 1332 * detail message. 
1333 */ 1334 public MalformedStreamException() { 1335 } 1336 1337 /** 1338 * Constructs an {@code MalformedStreamException} with 1339 * the specified detail message. 1340 * 1341 * @param message The detail message. 1342 */ 1343 public MalformedStreamException(final String message) { 1344 super(message); 1345 } 1346 1347 } 1348 1349 /** 1350 * Thrown upon attempt of setting an invalid boundary token. 1351 */ 1352 public static class IllegalBoundaryException extends IOException { 1353 1354 /** 1355 * The UID to use when serializing this instance. 1356 */ 1357 private static final long serialVersionUID = -161533165102632918L; 1358 1359 /** 1360 * Constructs an {@code IllegalBoundaryException} with no 1361 * detail message. 1362 */ 1363 public IllegalBoundaryException() { 1364 } 1365 1366 /** 1367 * Constructs an {@code IllegalBoundaryException} with 1368 * the specified detail message. 1369 * 1370 * @param message The detail message. 1371 */ 1372 public IllegalBoundaryException(final String message) { 1373 super(message); 1374 } 1375 1376 } 1377 1378 /** 1379 * An {@link InputStream} for reading an items contents. 1380 */ 1381 public class ItemInputStream extends InputStream implements Closeable { 1382 1383 /** 1384 * The number of bytes, which have been read so far. 1385 */ 1386 private long total; 1387 1388 /** 1389 * The number of bytes, which must be hold, because 1390 * they might be a part of the boundary. 1391 */ 1392 private int pad; 1393 1394 /** 1395 * The current offset in the buffer. 1396 */ 1397 private int pos; 1398 1399 /** 1400 * Whether the stream is already closed. 1401 */ 1402 private boolean closed; 1403 1404 /** 1405 * Creates a new instance. 1406 */ 1407 ItemInputStream() { 1408 findSeparator(); 1409 } 1410 1411 /** 1412 * Called for finding the separator. 
1413 */ 1414 private void findSeparator() { 1415 pos = MultipartStream.this.findSeparator(); 1416 if (pos == -1) { 1417 if (tail - head > keepRegion) { 1418 pad = keepRegion; 1419 } else { 1420 pad = tail - head; 1421 } 1422 } 1423 } 1424 1425 /** 1426 * Returns the number of bytes, which have been read 1427 * by the stream. 1428 * 1429 * @return Number of bytes, which have been read so far. 1430 */ 1431 public long getBytesRead() { 1432 return total; 1433 } 1434 1435 /** 1436 * Returns the number of bytes, which are currently 1437 * available, without blocking. 1438 * 1439 * @return Number of bytes in the buffer. 1440 * @throws IOException An I/O error occurs. 1441 */ 1442 @Override 1443 public int available() throws IOException { 1444 if (pos == -1) { 1445 return tail - head - pad; 1446 } 1447 return pos - head; 1448 } 1449 1450 /** 1451 * Offset when converting negative bytes to integers. 1452 */ 1453 private static final int BYTE_POSITIVE_OFFSET = 256; 1454 1455 /** 1456 * Returns the next byte in the stream. 1457 * 1458 * @return The next byte in the stream, as a non-negative 1459 * integer, or -1 for EOF. 1460 * @throws IOException An I/O error occurred. 1461 */ 1462 @Override 1463 public int read() throws IOException { 1464 if (closed) { 1465 throw new FileItemStream.ItemSkippedException(); 1466 } 1467 if (available() == 0 && makeAvailable() == 0) { 1468 return -1; 1469 } 1470 ++total; 1471 final int b = buffer[head++]; 1472 if (b >= 0) { 1473 return b; 1474 } 1475 return b + BYTE_POSITIVE_OFFSET; 1476 } 1477 1478 /** 1479 * Reads bytes into the given buffer. 1480 * 1481 * @param b The destination buffer, where to write to. 1482 * @param off Offset of the first byte in the buffer. 1483 * @param len Maximum number of bytes to read. 1484 * @return Number of bytes, which have been actually read, 1485 * or -1 for EOF. 1486 * @throws IOException An I/O error occurred. 
1487 */ 1488 @Override 1489 public int read(final byte[] b, final int off, final int len) throws IOException { 1490 if (closed) { 1491 throw new FileItemStream.ItemSkippedException(); 1492 } 1493 if (len == 0) { 1494 return 0; 1495 } 1496 int res = available(); 1497 if (res == 0) { 1498 res = makeAvailable(); 1499 if (res == 0) { 1500 return -1; 1501 } 1502 } 1503 res = Math.min(res, len); 1504 System.arraycopy(buffer, head, b, off, res); 1505 head += res; 1506 total += res; 1507 return res; 1508 } 1509 1510 /** 1511 * Closes the input stream. 1512 * 1513 * @throws IOException An I/O error occurred. 1514 */ 1515 @Override 1516 public void close() throws IOException { 1517 close(false); 1518 } 1519 1520 /** 1521 * Closes the input stream. 1522 * 1523 * @param pCloseUnderlying Whether to close the underlying stream 1524 * (hard close) 1525 * @throws IOException An I/O error occurred. 1526 */ 1527 public void close(final boolean pCloseUnderlying) throws IOException { 1528 if (closed) { 1529 return; 1530 } 1531 if (pCloseUnderlying) { 1532 closed = true; 1533 input.close(); 1534 } else { 1535 for (; ; ) { 1536 int av = available(); 1537 if (av == 0) { 1538 av = makeAvailable(); 1539 if (av == 0) { 1540 break; 1541 } 1542 } 1543 skip(av); 1544 } 1545 } 1546 closed = true; 1547 } 1548 1549 /** 1550 * Skips the given number of bytes. 1551 * 1552 * @param bytes Number of bytes to skip. 1553 * @return The number of bytes, which have actually been 1554 * skipped. 1555 * @throws IOException An I/O error occurred. 1556 */ 1557 @Override 1558 public long skip(final long bytes) throws IOException { 1559 if (closed) { 1560 throw new FileItemStream.ItemSkippedException(); 1561 } 1562 int av = available(); 1563 if (av == 0) { 1564 av = makeAvailable(); 1565 if (av == 0) { 1566 return 0; 1567 } 1568 } 1569 final long res = Math.min(av, bytes); 1570 head += res; 1571 return res; 1572 } 1573 1574 /** 1575 * Attempts to read more data. 
1576 * 1577 * @return Number of available bytes 1578 * @throws IOException An I/O error occurred. 1579 */ 1580 private int makeAvailable() throws IOException { 1581 if (pos != -1) { 1582 return 0; 1583 } 1584 1585 // Move the data to the beginning of the buffer. 1586 total += tail - head - pad; 1587 System.arraycopy(buffer, tail - pad, buffer, 0, pad); 1588 1589 // Refill buffer with new data. 1590 head = 0; 1591 tail = pad; 1592 1593 for (; ; ) { 1594 final int bytesRead = input.read(buffer, tail, bufSize - tail); 1595 if (bytesRead == -1) { 1596 // The last pad amount is left in the buffer. 1597 // Boundary can't be in there so signal an error 1598 // condition. 1599 final String msg = "Stream ended unexpectedly"; 1600 throw new MalformedStreamException(msg); 1601 } 1602 if (notifier != null) { 1603 notifier.noteBytesRead(bytesRead); 1604 } 1605 tail += bytesRead; 1606 1607 findSeparator(); 1608 final int av = available(); 1609 1610 if (av > 0 || pos != -1) { 1611 return av; 1612 } 1613 } 1614 } 1615 1616 /** 1617 * Returns, whether the stream is closed. 1618 * 1619 * @return True, if the stream is closed, otherwise false. 1620 */ 1621 public boolean isClosed() { 1622 return closed; 1623 } 1624 1625 } 1626 } 1627 1628 /** 1629 * A simple parser intended to parse sequences of name/value pairs. 1630 * <p> 1631 * Parameter values are expected to be enclosed in quotes if they contain unsafe characters, such as '=' characters or separators. Parameter values are optional 1632 * and can be omitted. 1633 * </p> 1634 * <p> 1635 * {@code param1 = value; param2 = "anything goes; really"; param3} 1636 * </p> 1637 */ 1638 protected class ParameterParser { 1639 1640 /** 1641 * String to be parsed. 1642 */ 1643 private char[] chars = null; 1644 1645 /** 1646 * Current position in the string. 1647 */ 1648 private int pos = 0; 1649 1650 /** 1651 * Maximum position in the string. 1652 */ 1653 private int len = 0; 1654 1655 /** 1656 * Start of a token. 
1657 */ 1658 private int i1 = 0; 1659 1660 /** 1661 * End of a token. 1662 */ 1663 private int i2 = 0; 1664 1665 /** 1666 * Whether names stored in the map should be converted to lower case. 1667 */ 1668 private boolean lowerCaseNames = false; 1669 1670 /** 1671 * Default ParameterParser constructor. 1672 */ 1673 public ParameterParser() { 1674 } 1675 1676 /** 1677 * A helper method to process the parsed token. This method removes leading and trailing blanks as well as enclosing quotation marks, when necessary. 1678 * 1679 * @param quoted {@code true} if quotation marks are expected, {@code false} otherwise. 1680 * @return the token 1681 */ 1682 private String getToken(final boolean quoted) { 1683 // Trim leading white spaces 1684 while (i1 < i2 && Character.isWhitespace(chars[i1])) { 1685 i1++; 1686 } 1687 // Trim trailing white spaces 1688 while (i2 > i1 && Character.isWhitespace(chars[i2 - 1])) { 1689 i2--; 1690 } 1691 // Strip away quotation marks if necessary 1692 if (quoted && i2 - i1 >= 2 && chars[i1] == '"' && chars[i2 - 1] == '"') { 1693 i1++; 1694 i2--; 1695 } 1696 String result = null; 1697 if (i2 > i1) { 1698 result = new String(chars, i1, i2 - i1); 1699 } 1700 return result; 1701 } 1702 1703 /** 1704 * Tests if there any characters left to parse. 1705 * 1706 * @return {@code true} if there are unparsed characters, {@code false} otherwise. 1707 */ 1708 private boolean hasChar() { 1709 return this.pos < this.len; 1710 } 1711 1712 /** 1713 * Tests {@code true} if parameter names are to be converted to lower case when name/value pairs are parsed. 1714 * 1715 * @return {@code true} if parameter names are to be converted to lower case when name/value pairs are parsed. Otherwise returns {@code false} 1716 */ 1717 public boolean isLowerCaseNames() { 1718 return this.lowerCaseNames; 1719 } 1720 1721 /** 1722 * Tests if the given character is present in the array of characters. 
1723 * 1724 * @param ch the character to test for presence in the array of characters 1725 * @param charray the array of characters to test against 1726 * @return {@code true} if the character is present in the array of characters, {@code false} otherwise. 1727 */ 1728 private boolean isOneOf(final char ch, final char[] charray) { 1729 var result = false; 1730 for (final char element : charray) { 1731 if (ch == element) { 1732 result = true; 1733 break; 1734 } 1735 } 1736 return result; 1737 } 1738 1739 /** 1740 * Parses a map of name/value pairs from the given array of characters. Names are expected to be unique. 1741 * 1742 * @param charArray the array of characters that contains a sequence of name/value pairs 1743 * @param separator the name/value pairs separator 1744 * @return a map of name/value pairs 1745 */ 1746 public Map<String, String> parse(final char[] charArray, final char separator) { 1747 if (charArray == null) { 1748 return new LinkedHashMap<>(); 1749 } 1750 return parse(charArray, 0, charArray.length, separator); 1751 } 1752 1753 /** 1754 * Parses a map of name/value pairs from the given array of characters. Names are expected to be unique. 1755 * 1756 * @param charArray the array of characters that contains a sequence of name/value pairs 1757 * @param offset - the initial offset. 1758 * @param length - the length. 
1759 * @param separator the name/value pairs separator 1760 * @return a map of name/value pairs 1761 */ 1762 public Map<String, String> parse(final char[] charArray, final int offset, final int length, final char separator) { 1763 1764 if (charArray == null) { 1765 return new LinkedHashMap<>(); 1766 } 1767 final var params = new LinkedHashMap<String, String>(); 1768 this.chars = charArray.clone(); 1769 this.pos = offset; 1770 this.len = length; 1771 1772 String paramName; 1773 String paramValue; 1774 while (hasChar()) { 1775 paramName = parseToken(new char[]{'=', separator}); 1776 paramValue = null; 1777 if (hasChar() && charArray[pos] == '=') { 1778 pos++; // skip '=' 1779 paramValue = parseQuotedToken(new char[]{separator}); 1780 1781 if (paramValue != null) { 1782 try { 1783 paramValue = RFC2231Utils.hasEncodedValue(paramName) ? RFC2231Utils.decodeText(paramValue) : MimeUtils.decodeText(paramValue); 1784 } catch (final UnsupportedEncodingException ignored) { 1785 // let's keep the original value in this case 1786 } 1787 } 1788 } 1789 if (hasChar() && charArray[pos] == separator) { 1790 pos++; // skip separator 1791 } 1792 if (paramName != null && !paramName.isEmpty()) { 1793 paramName = RFC2231Utils.stripDelimiter(paramName); 1794 if (this.lowerCaseNames) { 1795 paramName = paramName.toLowerCase(Locale.ENGLISH); 1796 } 1797 params.put(paramName, paramValue); 1798 } 1799 } 1800 return params; 1801 } 1802 1803 /** 1804 * Parses a map of name/value pairs from the given string. Names are expected to be unique. 1805 * 1806 * @param str the string that contains a sequence of name/value pairs 1807 * @param separator the name/value pairs separator 1808 * @return a map of name/value pairs 1809 */ 1810 public Map<String, String> parse(final String str, final char separator) { 1811 if (str == null) { 1812 return new LinkedHashMap<>(); 1813 } 1814 return parse(str.toCharArray(), separator); 1815 } 1816 1817 /** 1818 * Parses a map of name/value pairs from the given string. 
Names are expected to be unique. Multiple separators may be specified and the earliest found in 1819 * the input string is used. 1820 * 1821 * @param str the string that contains a sequence of name/value pairs 1822 * @param separators the name/value pairs separators 1823 * @return a map of name/value pairs 1824 */ 1825 public Map<String, String> parse(final String str, final char[] separators) { 1826 if (separators == null || separators.length == 0) { 1827 return new LinkedHashMap<>(); 1828 } 1829 var separator = separators[0]; 1830 if (str != null) { 1831 var idx = str.length(); 1832 for (final char separator2 : separators) { 1833 final var tmp = str.indexOf(separator2); 1834 if (tmp != -1 && tmp < idx) { 1835 idx = tmp; 1836 separator = separator2; 1837 } 1838 } 1839 } 1840 return parse(str, separator); 1841 } 1842 1843 /** 1844 * Parses out a token until any of the given terminators is encountered outside the quotation marks. 1845 * 1846 * @param terminators the array of terminating characters. Any of these characters when encountered outside the quotation marks signify the end of the token 1847 * @return the token 1848 */ 1849 private String parseQuotedToken(final char[] terminators) { 1850 char ch; 1851 i1 = pos; 1852 i2 = pos; 1853 var quoted = false; 1854 var charEscaped = false; 1855 while (hasChar()) { 1856 ch = chars[pos]; 1857 if (!quoted && isOneOf(ch, terminators)) { 1858 break; 1859 } 1860 if (!charEscaped && ch == '"') { 1861 quoted = !quoted; 1862 } 1863 charEscaped = !charEscaped && ch == '\\'; 1864 i2++; 1865 pos++; 1866 1867 } 1868 return getToken(true); 1869 } 1870 1871 /** 1872 * Parses out a token until any of the given terminators is encountered. 1873 * 1874 * @param terminators the array of terminating characters. 
Any of these characters when encountered signify the end of the token 1875 * @return the token 1876 */ 1877 private String parseToken(final char[] terminators) { 1878 char ch; 1879 i1 = pos; 1880 i2 = pos; 1881 while (hasChar()) { 1882 ch = chars[pos]; 1883 if (isOneOf(ch, terminators)) { 1884 break; 1885 } 1886 i2++; 1887 pos++; 1888 } 1889 return getToken(false); 1890 } 1891 1892 /** 1893 * Sets the flag if parameter names are to be converted to lower case when name/value pairs are parsed. 1894 * 1895 * @param lowerCaseNames {@code true} if parameter names are to be converted to lower case when name/value pairs are parsed. {@code false} otherwise. 1896 */ 1897 public void setLowerCaseNames(final boolean lowerCaseNames) { 1898 this.lowerCaseNames = lowerCaseNames; 1899 } 1900 1901 } 1902 1903 /** 1904 * Utility class to decode/encode character set on HTTP Header fields based on RFC 2231. This implementation adheres to RFC 5987 in particular, which was 1905 * defined for HTTP headers 1906 * <p> 1907 * RFC 5987 builds on RFC 2231, but has lesser scope like <a href="https://tools.ietf.org/html/rfc5987#section-3.2">mandatory charset definition</a> and 1908 * <a href="https://tools.ietf.org/html/rfc5987#section-4">no parameter continuation</a> 1909 * </p> 1910 * 1911 * @see <a href="https://tools.ietf.org/html/rfc2231">RFC 2231</a> 1912 * @see <a href="https://tools.ietf.org/html/rfc5987">RFC 5987</a> 1913 */ 1914 protected final class RFC2231Utils { 1915 1916 /** 1917 * The Hexadecimal values char array. 1918 */ 1919 private static final char[] HEX_DIGITS = "0123456789ABCDEF".toCharArray(); 1920 /** 1921 * The Hexadecimal representation of 127. 1922 */ 1923 private static final byte MASK = 0x7f; 1924 /** 1925 * The Hexadecimal representation of 128. 1926 */ 1927 private static final int MASK_128 = 0x80; 1928 /** 1929 * The Hexadecimal decode value. 
1930 */ 1931 private static final byte[] HEX_DECODE = new byte[MASK_128]; 1932 1933 // create a ASCII decoded array of Hexadecimal values 1934 static { 1935 for (var i = 0; i < HEX_DIGITS.length; i++) { 1936 HEX_DECODE[HEX_DIGITS[i]] = (byte) i; 1937 HEX_DECODE[Character.toLowerCase(HEX_DIGITS[i])] = (byte) i; 1938 } 1939 } 1940 1941 /** 1942 * Decodes a string of text obtained from a HTTP header as per RFC 2231 1943 * 1944 * <b>Eg 1.</b> {@code us-ascii'en-us'This%20is%20%2A%2A%2Afun%2A%2A%2A} will be decoded to {@code This is ***fun***} 1945 * 1946 * <b>Eg 2.</b> {@code iso-8859-1'en'%A3%20rate} will be decoded to {@code £ rate}. 1947 * 1948 * <b>Eg 3.</b> {@code UTF-8''%c2%a3%20and%20%e2%82%ac%20rates} will be decoded to {@code £ and € rates}. 1949 * 1950 * @param encodedText - Text to be decoded has a format of {@code <charset>'<language>'<encoded_value>} and ASCII only 1951 * @return Decoded text based on charset encoding 1952 * @throws UnsupportedEncodingException The requested character set wasn't found. 1953 */ 1954 static String decodeText(final String encodedText) throws UnsupportedEncodingException { 1955 final var langDelimitStart = encodedText.indexOf('\''); 1956 if (langDelimitStart == -1) { 1957 // missing charset 1958 return encodedText; 1959 } 1960 final var mimeCharset = encodedText.substring(0, langDelimitStart); 1961 final var langDelimitEnd = encodedText.indexOf('\'', langDelimitStart + 1); 1962 if (langDelimitEnd == -1) { 1963 // missing language 1964 return encodedText; 1965 } 1966 final var bytes = fromHex(encodedText.substring(langDelimitEnd + 1)); 1967 return new String(bytes, getJavaCharset(mimeCharset)); 1968 } 1969 1970 /** 1971 * Converts {@code text} to their corresponding Hex value. 
1972 * 1973 * @param text - ASCII text input 1974 * @return Byte array of characters decoded from ASCII table 1975 */ 1976 private static byte[] fromHex(final String text) { 1977 final var shift = 4; 1978 final var out = new ByteArrayOutputStream(text.length()); 1979 for (var i = 0; i < text.length(); ) { 1980 final var c = text.charAt(i++); 1981 if (c == '%') { 1982 if (i > text.length() - 2) { 1983 break; // unterminated sequence 1984 } 1985 final var b1 = HEX_DECODE[text.charAt(i++) & MASK]; 1986 final var b2 = HEX_DECODE[text.charAt(i++) & MASK]; 1987 out.write(b1 << shift | b2); 1988 } else { 1989 out.write((byte) c); 1990 } 1991 } 1992 return out.toByteArray(); 1993 } 1994 1995 private static String getJavaCharset(final String mimeCharset) { 1996 // good enough for standard values 1997 return mimeCharset; 1998 } 1999 2000 /** 2001 * Tests if asterisk (*) at the end of parameter name to indicate, if it has charset and language information to decode the value. 2002 * 2003 * @param paramName The parameter, which is being checked. 2004 * @return {@code true}, if encoded as per RFC 2231, {@code false} otherwise 2005 */ 2006 static boolean hasEncodedValue(final String paramName) { 2007 if (paramName != null) { 2008 return paramName.lastIndexOf('*') == paramName.length() - 1; 2009 } 2010 return false; 2011 } 2012 2013 /** 2014 * If {@code paramName} has Asterisk (*) at the end, it will be stripped off, else the passed value will be returned. 2015 * 2016 * @param paramName The parameter, which is being inspected. 2017 * @return stripped {@code paramName} of Asterisk (*), if RFC2231 encoded 2018 */ 2019 static String stripDelimiter(final String paramName) { 2020 if (hasEncodedValue(paramName)) { 2021 final var paramBuilder = new StringBuilder(paramName); 2022 paramBuilder.deleteCharAt(paramName.lastIndexOf('*')); 2023 return paramBuilder.toString(); 2024 } 2025 return paramName; 2026 } 2027 2028 /** 2029 * Private constructor so that no instances can be created. 
This class contains only static utility methods. 2030 */ 2031 private RFC2231Utils() { 2032 } 2033 } 2034 2035 /** 2036 * Utility class to decode MIME texts. 2037 */ 2038 protected final class MimeUtils { 2039 2040 /** 2041 * The marker to indicate text is encoded with BASE64 algorithm. 2042 */ 2043 private static final String BASE64_ENCODING_MARKER = "B"; 2044 2045 /** 2046 * The marker to indicate text is encoded with QuotedPrintable algorithm. 2047 */ 2048 private static final String QUOTEDPRINTABLE_ENCODING_MARKER = "Q"; 2049 2050 /** 2051 * If the text contains any encoded tokens, those tokens will be marked with "=?". 2052 */ 2053 private static final String ENCODED_TOKEN_MARKER = "=?"; 2054 2055 /** 2056 * If the text contains any encoded tokens, those tokens will terminate with "=?". 2057 */ 2058 private static final String ENCODED_TOKEN_FINISHER = "?="; 2059 2060 /** 2061 * The linear whitespace chars sequence. 2062 */ 2063 private static final String LINEAR_WHITESPACE = " \t\r\n"; 2064 2065 /** 2066 * Mappings between MIME and Java charset. 2067 */ 2068 private static final Map<String, String> MIME2JAVA = new HashMap<>(); 2069 2070 static { 2071 MIME2JAVA.put("iso-2022-cn", "ISO2022CN"); 2072 MIME2JAVA.put("iso-2022-kr", "ISO2022KR"); 2073 MIME2JAVA.put("utf-8", "UTF8"); 2074 MIME2JAVA.put("utf8", "UTF8"); 2075 MIME2JAVA.put("ja_jp.iso2022-7", "ISO2022JP"); 2076 MIME2JAVA.put("ja_jp.eucjp", "EUCJIS"); 2077 MIME2JAVA.put("euc-kr", "KSC5601"); 2078 MIME2JAVA.put("euckr", "KSC5601"); 2079 MIME2JAVA.put("us-ascii", "ISO-8859-1"); 2080 MIME2JAVA.put("x-us-ascii", "ISO-8859-1"); 2081 } 2082 2083 /** 2084 * Decodes a string of text obtained from a mail header into its proper form. The text generally will consist of a string of tokens, some of which may be 2085 * encoded using base64 encoding. 2086 * 2087 * @param text The text to decode. 2088 * @return The decoded text string. 
* @throws UnsupportedEncodingException if the detected encoding in the input text is not supported.
         */
        static String decodeText(final String text) throws UnsupportedEncodingException {
            // if the text contains any encoded tokens, those tokens will be marked with "=?". If the
            // source string doesn't contain that sequent, no decoding is required.
            if (!text.contains(ENCODED_TOKEN_MARKER)) {
                return text;
            }

            final int limit = text.length();
            final StringBuilder decodedText = new StringBuilder(limit);

            int cursor = 0;
            // Bounds of the most recent whitespace run; whitespaceStart == -1 means "none pending".
            int whitespaceStart = -1;
            int whitespaceEnd = -1;
            boolean previousTokenEncoded = false;

            while (cursor < limit) {
                // is this a whitespace character?
                if (LINEAR_WHITESPACE.indexOf(text.charAt(cursor)) != -1) { // whitespace found
                    whitespaceStart = cursor;
                    // step over the white space characters.
                    while (cursor < limit && LINEAR_WHITESPACE.indexOf(text.charAt(cursor)) != -1) {
                        cursor++;
                    }
                    // record the location of the first non lwsp; a run that reaches the end of
                    // the text leaves the previous end untouched.
                    if (cursor < limit) {
                        whitespaceEnd = cursor;
                    }
                    continue;
                }

                // we have a word token. We need to scan over the word and then try to parse it.
                final int wordStart = cursor;
                // step over the non white space characters.
                // NB: Trailing whitespace on these header strings will just be discarded.
                while (cursor < limit && LINEAR_WHITESPACE.indexOf(text.charAt(cursor)) == -1) {
                    cursor++;
                }
                // pull out the word token.
                final String word = text.substring(wordStart, cursor);

                // is the token encoded? decode the word
                if (word.startsWith(ENCODED_TOKEN_MARKER)) {
                    try {
                        // if this gives a parsing failure, treat it like a non-encoded word.
                        final String decodedWord = decodeWord(word);

                        // are any whitespace characters significant? Append 'em if we've got 'em.
                        if (!previousTokenEncoded && whitespaceStart != -1) {
                            decodedText.append(text, whitespaceStart, whitespaceEnd);
                            whitespaceStart = -1;
                        }
                        // this is definitely a decoded token.
                        previousTokenEncoded = true;
                        // and add this to the text.
                        decodedText.append(decodedWord);
                        // we continue parsing from here...we allow parsing errors to fall through
                        // and get handled as normal text.
                        continue;

                    } catch (final ParseException ignored) {
                        // just ignore it, skip to next word
                    }
                }

                // this is a normal token, so it doesn't matter what the previous token was. Add the white space
                // if we have it.
                if (whitespaceStart != -1) {
                    decodedText.append(text, whitespaceStart, whitespaceEnd);
                    whitespaceStart = -1;
                }
                // this is not a decoded token.
                previousTokenEncoded = false;
                decodedText.append(word);
            }

            return decodedText.toString();
        }

        /**
         * Decodes a string using the RFC 2047 rules for an "encoded-word" type. This encoding has the syntax:
         * <p>
         * encoded-word = "=?" charset "?" encoding "?" encoded-text "?="
         *
         * @param word The possibly encoded word value.
         * @return The decoded word.
         * @throws ParseException               in case of a parse error of the RFC 2047.
         * @throws UnsupportedEncodingException Thrown when Invalid RFC 2047 encoding was found.
         */
        private static String decodeWord(final String word) throws ParseException, UnsupportedEncodingException {
            // encoded words start with the characters "=?". If this not an encoded word, we throw a
            // ParseException for the caller.
2192 2193 final var etmPos = word.indexOf(ENCODED_TOKEN_MARKER); 2194 if (etmPos != 0) { 2195 throw new ParseException("Invalid RFC 2047 encoded-word: " + word, etmPos); 2196 } 2197 2198 final var charsetPos = word.indexOf('?', 2); 2199 if (charsetPos == -1) { 2200 throw new ParseException("Missing charset in RFC 2047 encoded-word: " + word, charsetPos); 2201 } 2202 2203 // pull out the character set information (this is the MIME name at this point). 2204 final var charset = word.substring(2, charsetPos).toLowerCase(Locale.ENGLISH); 2205 2206 // now pull out the encoding token the same way. 2207 final var encodingPos = word.indexOf('?', charsetPos + 1); 2208 if (encodingPos == -1) { 2209 throw new ParseException("Missing encoding in RFC 2047 encoded-word: " + word, encodingPos); 2210 } 2211 2212 final var encoding = word.substring(charsetPos + 1, encodingPos); 2213 2214 // and finally the encoded text. 2215 final var encodedTextPos = word.indexOf(ENCODED_TOKEN_FINISHER, encodingPos + 1); 2216 if (encodedTextPos == -1) { 2217 throw new ParseException("Missing encoded text in RFC 2047 encoded-word: " + word, encodedTextPos); 2218 } 2219 2220 final var encodedText = word.substring(encodingPos + 1, encodedTextPos); 2221 2222 // seems a bit silly to encode a null string, but easy to deal with. 2223 if (encodedText.isEmpty()) { 2224 return ""; 2225 } 2226 2227 try { 2228 // the decoder writes directly to an output stream. 2229 final var out = new ByteArrayOutputStream(encodedText.length()); 2230 2231 final var encodedData = encodedText.getBytes(StandardCharsets.US_ASCII); 2232 2233 // Base64 encoded? 2234 if (encoding.equals(BASE64_ENCODING_MARKER)) { 2235 out.write(Base64.getMimeDecoder().decode(encodedData)); 2236 } else if (encoding.equals(QUOTEDPRINTABLE_ENCODING_MARKER)) { // maybe quoted printable. 
2237 QuotedPrintableDecoder.decode(encodedData, out); 2238 } else { 2239 throw new UnsupportedEncodingException("Unknown RFC 2047 encoding: " + encoding); 2240 } 2241 // get the decoded byte data and convert into a string. 2242 final var decodedData = out.toByteArray(); 2243 return new String(decodedData, javaCharset(charset)); 2244 } catch (final IOException e) { 2245 throw new UnsupportedEncodingException("Invalid RFC 2047 encoding"); 2246 } 2247 } 2248 2249 /** 2250 * Translate a MIME standard character set name into the Java equivalent. 2251 * 2252 * @param charset The MIME standard name. 2253 * @return The Java equivalent for this name. 2254 */ 2255 private static String javaCharset(final String charset) { 2256 // nothing in, nothing out. 2257 if (charset == null) { 2258 return null; 2259 } 2260 final var mappedCharset = MIME2JAVA.get(charset.toLowerCase(Locale.ENGLISH)); 2261 // if there is no mapping, then the original name is used. Many of the MIME character set 2262 // names map directly back into Java. The reverse isn't necessarily true. 2263 return mappedCharset == null ? charset : mappedCharset; 2264 } 2265 2266 /** 2267 * Hidden constructor, this class must not be instantiated. 2268 */ 2269 private MimeUtils() { 2270 // do nothing 2271 } 2272 2273 } 2274 2275 protected final class QuotedPrintableDecoder { 2276 2277 /** 2278 * The shift value required to create the upper nibble from the first of 2 byte values converted from ASCII hex. 2279 */ 2280 private static final int UPPER_NIBBLE_SHIFT = Byte.SIZE / 2; 2281 2282 /** 2283 * Decodes the encoded byte data writing it to the given output stream. 2284 * 2285 * @param data The array of byte data to decode. 2286 * @param out The output stream used to return the decoded data. 2287 * @return the number of bytes produced. 
2288 * @throws IOException if an IO error occurs 2289 */ 2290 public static int decode(final byte[] data, final OutputStream out) throws IOException { 2291 var off = 0; 2292 final var length = data.length; 2293 final var endOffset = off + length; 2294 var bytesWritten = 0; 2295 2296 while (off < endOffset) { 2297 final var ch = data[off++]; 2298 2299 // space characters were translated to '_' on encode, so we need to translate them back. 2300 if (ch == '_') { 2301 out.write(' '); 2302 } else if (ch == '=') { 2303 // we found an encoded character. Reduce the 3 char sequence to one. 2304 // but first, make sure we have two characters to work with. 2305 if (off + 1 >= endOffset) { 2306 throw new IOException("Invalid quoted printable encoding; truncated escape sequence"); 2307 } 2308 2309 final var b1 = data[off++]; 2310 final var b2 = data[off++]; 2311 2312 // we've found an encoded carriage return. The next char needs to be a newline 2313 if (b1 == '\r') { 2314 if (b2 != '\n') { 2315 throw new IOException("Invalid quoted printable encoding; CR must be followed by LF"); 2316 } 2317 // this was a soft linebreak inserted by the encoding. We just toss this away 2318 // on decode. 2319 } else { 2320 // this is a hex pair we need to convert back to a single byte. 2321 final var c1 = hexToBinary(b1); 2322 final var c2 = hexToBinary(b2); 2323 out.write(c1 << UPPER_NIBBLE_SHIFT | c2); 2324 // 3 bytes in, one byte out 2325 bytesWritten++; 2326 } 2327 } else { 2328 // simple character, just write it out. 2329 out.write(ch); 2330 bytesWritten++; 2331 } 2332 } 2333 2334 return bytesWritten; 2335 } 2336 2337 /** 2338 * Converts a hexadecimal digit to the binary value it represents. 2339 * 2340 * @param b the ASCII hexadecimal byte to convert (0-0, A-F, a-f) 2341 * @return the int value of the hexadecimal byte, 0-15 2342 * @throws IOException if the byte is not a valid hexadecimal digit. 
2343 */ 2344 private static int hexToBinary(final byte b) throws IOException { 2345 // CHECKSTYLE IGNORE MagicNumber FOR NEXT 1 LINE 2346 final var i = Character.digit((char) b, 16); 2347 if (i == -1) { 2348 throw new IOException("Invalid quoted printable encoding: not a valid hex digit: " + b); 2349 } 2350 return i; 2351 } 2352 2353 /** 2354 * Hidden constructor, this class must not be instantiated. 2355 */ 2356 private QuotedPrintableDecoder() { 2357 // do nothing 2358 } 2359 2360 } 2361 2362 // *** END commons-fileupload source *** 2363 2364 // For HTML-Unescaper below, see https://gist.github.com/MarkJeronimus/798c452582e64410db769933ec71cfb7 2365 2366 // *** START HTML-Unescaper source *** 2367 2368 /** 2369 * HTML Un-escaper by Nick Frolov. 2370 * <p> 2371 * With improvement suggested by Axel Dörfler. 2372 * <p> 2373 * Replaced character map with HTML5 characters from<a href="https://www.w3schools.com/charsets/ref_html_entities_a.asp"> 2374 * https://www.w3schools.com/charsets/ref_html_entities_a.asp</a> 2375 * 2376 * @author Nick Frolov, Mark Jeronimus 2377 */ 2378// Created 2020-06-22 2379 protected static class HTMLUtilities { 2380 // Tables optimized for smallest .class size (without resorting to compression) 2381 private static final String[] NAMES = 2382 {"excl", "quot", "num", "dollar", "percnt", "amp", "apos", "lpar", "rpar", "ast", "midast", "plus", "comma", 2383 "period", "sol", "colon", "semi", "lt", "equals", "GT", "quest", "commat", "lbrack", "lsqb", "bsol", 2384 "rbrack", "rsqb", "Hat", "lowbar", "UnderBar", "DiacriticalGrave", "grave", "lbrace", "lcub", "verbar", 2385 "vert", "VerticalLine", "rbrace", "rcub", "nbsp", "NonBreakingSpace", "iexcl", "cent", "pound", "curren", 2386 "yen", "brvbar", "sect", "die", "Dot", "DoubleDot", "uml", "copy", "ordf", "laquo", "not", "shy", 2387 "circledR", "reg", "macr", "strns", "deg", "plusmn", "pm", "sup2", "sup3", "acute", "DiacriticalAcute", 2388 "micro", "para", "CenterDot", "centerdot", "middot", 
"cedil", "Cedilla", "sup1", "ordm", "raquo", "frac14", 2389 "frac12", "half", "frac34", "iquest", "Agrave", "Aacute", "Acirc", "Atilde", "Auml", "angst", "Aring", 2390 "AElig", "Ccedil", "Egrave", "Eacute", "Ecirc", "Euml", "Igrave", "Iacute", "Icirc", "Iuml", "ETH", 2391 "Ntilde", "Ograve", "Oacute", "Ocirc", "Otilde", "Ouml", "times", "Oslash", "Ugrave", "Uacute", "Ucirc", 2392 "Uuml", "Yacute", "THORN", "szlig", "agrave", "aacute", "acirc", "atilde", "auml", "aring", "aelig", 2393 "ccedil", "egrave", "eacute", "ecirc", "euml", "igrave", "iacute", "icirc", "iuml", "eth", "ntilde", 2394 "ograve", "oacute", "ocirc", "otilde", "ouml", "div", "divide", "oslash", "ugrave", "uacute", "ucirc", 2395 "uuml", "yacute", "thorn", "yuml", "Amacr", "amacr", "Abreve", "abreve", "Aogon", "aogon", "Cacute", 2396 "cacute", "Ccirc", "ccirc", "Cdot", "cdot", "Ccaron", "ccaron", "Dcaron", "dcaron", "Dstrok", "dstrok", 2397 "Emacr", "emacr", "Edot", "edot", "Eogon", "eogon", "Ecaron", "ecaron", "Gcirc", "gcirc", "Gbreve", 2398 "gbreve", "Gdot", "gdot", "Gcedil", "Hcirc", "hcirc", "Hstrok", "hstrok", "Itilde", "itilde", "Imacr", 2399 "imacr", "Iogon", "iogon", "Idot", "imath", "inodot", "IJlig", "ijlig", "Jcirc", "jcirc", "Kcedil", 2400 "kcedil", "kgreen", "Lacute", "lacute", "Lcedil", "lcedil", "Lcaron", "lcaron", "Lmidot", "lmidot", 2401 "Lstrok", "lstrok", "Nacute", "nacute", "Ncedil", "ncedil", "Ncaron", "ncaron", "napos", "ENG", "eng", 2402 "Omacr", "omacr", "Odblac", "odblac", "OElig", "oelig", "Racute", "racute", "Rcedil", "rcedil", "Rcaron", 2403 "rcaron", "Sacute", "sacute", "Scirc", "scirc", "Scedil", "scedil", "Scaron", "scaron", "Tcedil", "tcedil", 2404 "Tcaron", "tcaron", "Tstrok", "tstrok", "Utilde", "utilde", "Umacr", "umacr", "Ubreve", "ubreve", "Uring", 2405 "uring", "Udblac", "udblac", "Uogon", "uogon", "Wcirc", "wcirc", "Ycirc", "ycirc", "Yuml", "Zacute", 2406 "zacute", "Zdot", "zdot", "Zcaron", "zcaron", "fnof", "imped", "gacute", "jmath", "circ", "caron", "Hacek", 
2407 "Breve", "breve", "DiacriticalDot", "dot", "ring", "ogon", "DiacriticalTilde", "tilde", "dblac", 2408 "DiacriticalDoubleAcute", "DownBreve", "Alpha", "Beta", "Gamma", "Delta", "Epsilon", "Zeta", "Eta", 2409 "Theta", "Iota", "Kappa", "Lambda", "Mu", "Nu", "Xi", "Omicron", "Pi", "Rho", "Sigma", "Tau", "Upsilon", 2410 "Phi", "Chi", "Psi", "ohm", "Omega", "alpha", "beta", "gamma", "delta", "epsi", "epsilon", "zeta", "eta", 2411 "theta", "iota", "kappa", "lambda", "mu", "nu", "xi", "omicron", "pi", "rho", "sigmaf", "sigmav", 2412 "varsigma", "sigma", "tau", "upsi", "upsilon", "phi", "chi", "psi", "omega", "thetasym", "thetav", 2413 "vartheta", "Upsi", "upsih", "phiv", "straightphi", "varphi", "piv", "varpi", "Gammad", "digamma", 2414 "gammad", "kappav", "varkappa", "rhov", "varrho", "epsiv", "straightepsilon", "varepsilon", "backepsilon", 2415 "bepsi", "IOcy", "DJcy", "GJcy", "Jukcy", "DScy", "Iukcy", "YIcy", "Jsercy", "LJcy", "NJcy", "TSHcy", 2416 "KJcy", "Ubrcy", "DZcy", "Acy", "Bcy", "Vcy", "Gcy", "Dcy", "IEcy", "ZHcy", "Zcy", "Icy", "Jcy", "Kcy", 2417 "Lcy", "Mcy", "Ncy", "Ocy", "Pcy", "Rcy", "Scy", "Tcy", "Ucy", "Fcy", "KHcy", "TScy", "CHcy", "SHcy", 2418 "SHCHcy", "HARDcy", "Ycy", "SOFTcy", "Ecy", "YUcy", "YAcy", "acy", "bcy", "vcy", "gcy", "dcy", "iecy", 2419 "zhcy", "zcy", "icy", "jcy", "kcy", "lcy", "mcy", "ncy", "ocy", "pcy", "rcy", "scy", "tcy", "ucy", "fcy", 2420 "khcy", "tscy", "chcy", "shcy", "shchcy", "hardcy", "ycy", "softcy", "ecy", "yucy", "yacy", "iocy", "djcy", 2421 "gjcy", "jukcy", "dscy", "iukcy", "yicy", "jsercy", "ljcy", "njcy", "tshcy", "kjcy", "ubrcy", "dzcy", 2422 "ensp", "emsp", "emsp13", "emsp14", "numsp", "puncsp", "thinsp", "ThinSpace", "hairsp", "VeryThinSpace", 2423 "ZeroWidthSpace", "zwnj", "zwj", "lrm", "rlm", "dash", "hyphen", "ndash", "mdash", "horbar", "Verbar", 2424 "Vert", "lsquo", "OpenCurlyQuote", "CloseCurlyQuote", "rsquo", "rsquor", "lsquor", "sbquo", "ldquo", 2425 "OpenCurlyDoubleQuote", "CloseCurlyDoubleQuote", "rdquo", 
"rdquor", "bdquo", "ldquor", "dagger", "ddagger", 2426 "bull", "bullet", "nldr", "hellip", "mldr", "permil", "pertenk", "prime", "Prime", "tprime", "backprime", 2427 "bprime", "lsaquo", "rsaquo", "oline", "OverBar", "caret", "hybull", "frasl", "bsemi", "qprime", 2428 "MediumSpace", "NoBreak", "af", "ApplyFunction", "InvisibleTimes", "it", "ic", "InvisibleComma", "euro", 2429 "tdot", "TripleDot", "DotDot", "complexes", "Copf", "incare", "gscr", "hamilt", "HilbertSpace", "Hscr", 2430 "Hfr", "Poincareplane", "Hopf", "quaternions", "planckh", "hbar", "hslash", "planck", "plankv", "imagline", 2431 "Iscr", "Ifr", "Im", "image", "imagpart", "lagran", "Laplacetrf", "Lscr", "ell", "naturals", "Nopf", 2432 "numero", "copysr", "weierp", "wp", "Popf", "primes", "Qopf", "rationals", "realine", "Rscr", "Re", "real", 2433 "realpart", "Rfr", "reals", "Ropf", "rx", "TRADE", "trade", "integers", "Zopf", "mho", "zeetrf", "Zfr", 2434 "iiota", "bernou", "Bernoullis", "Bscr", "Cayleys", "Cfr", "escr", "Escr", "expectation", "Fouriertrf", 2435 "Fscr", "Mellintrf", "Mscr", "phmmat", "order", "orderof", "oscr", "alefsym", "aleph", "beth", "gimel", 2436 "daleth", "CapitalDifferentialD", "DD", "dd", "DifferentialD", "ee", "ExponentialE", "exponentiale", "ii", 2437 "ImaginaryI", "frac13", "frac23", "frac15", "frac25", "frac35", "frac45", "frac16", "frac56", "frac18", 2438 "frac38", "frac58", "frac78", "larr", "LeftArrow", "leftarrow", "ShortLeftArrow", "slarr", "ShortUpArrow", 2439 "uarr", "UpArrow", "uparrow", "rarr", "RightArrow", "rightarrow", "ShortRightArrow", "srarr", "darr", 2440 "DownArrow", "downarrow", "ShortDownArrow", "harr", "LeftRightArrow", "leftrightarrow", "UpDownArrow", 2441 "updownarrow", "varr", "nwarr", "nwarrow", "UpperLeftArrow", "nearr", "nearrow", "UpperRightArrow", 2442 "LowerRightArrow", "searr", "searrow", "LowerLeftArrow", "swarr", "swarrow", "nlarr", "nleftarrow", 2443 "nrarr", "nrightarrow", "rarrw", "rightsquigarrow", "Larr", "twoheadleftarrow", "Uarr", "Rarr", 
2444 "twoheadrightarrow", "Darr", "larrtl", "leftarrowtail", "rarrtl", "rightarrowtail", "LeftTeeArrow", 2445 "mapstoleft", "mapstoup", "UpTeeArrow", "map", "mapsto", "RightTeeArrow", "DownTeeArrow", "mapstodown", 2446 "hookleftarrow", "larrhk", "hookrightarrow", "rarrhk", "larrlp", "looparrowleft", "looparrowright", 2447 "rarrlp", "harrw", "leftrightsquigarrow", "nharr", "nleftrightarrow", "Lsh", "lsh", "Rsh", "rsh", "ldsh", 2448 "rdsh", "crarr", "cularr", "curvearrowleft", "curarr", "curvearrowright", "circlearrowleft", "olarr", 2449 "circlearrowright", "orarr", "leftharpoonup", "LeftVector", "lharu", "DownLeftVector", "leftharpoondown", 2450 "lhard", "RightUpVector", "uharr", "upharpoonright", "LeftUpVector", "uharl", "upharpoonleft", "rharu", 2451 "rightharpoonup", "RightVector", "DownRightVector", "rhard", "rightharpoondown", "dharr", 2452 "downharpoonright", "RightDownVector", "dharl", "downharpoonleft", "LeftDownVector", "RightArrowLeftArrow", 2453 "rightleftarrows", "rlarr", "udarr", "UpArrowDownArrow", "LeftArrowRightArrow", "leftrightarrows", "lrarr", 2454 "leftleftarrows", "llarr", "upuparrows", "uuarr", "rightrightarrows", "rrarr", "ddarr", "downdownarrows", 2455 "leftrightharpoons", "lrhar", "ReverseEquilibrium", "Equilibrium", "rightleftharpoons", "rlhar", "nlArr", 2456 "nLeftarrow", "nhArr", "nLeftrightarrow", "nrArr", "nRightarrow", "DoubleLeftArrow", "lArr", "Leftarrow", 2457 "DoubleUpArrow", "uArr", "Uparrow", "DoubleRightArrow", "Implies", "rArr", "Rightarrow", "dArr", 2458 "DoubleDownArrow", "Downarrow", "DoubleLeftRightArrow", "hArr", "iff", "Leftrightarrow", 2459 "DoubleUpDownArrow", "Updownarrow", "vArr", "nwArr", "neArr", "seArr", "swArr", "lAarr", "Lleftarrow", 2460 "rAarr", "Rrightarrow", "zigrarr", "larrb", "LeftArrowBar", "rarrb", "RightArrowBar", "DownArrowUpArrow", 2461 "duarr", "loarr", "roarr", "hoarr", "ForAll", "forall", "comp", "complement", "part", "PartialD", "Exists", 2462 "exist", "nexist", "nexists", "NotExists", "empty", 
"emptyset", "emptyv", "varnothing", "Del", "nabla", 2463 "Element", "in", "isin", "isinv", "NotElement", "notin", "notinva", "ni", "niv", "ReverseElement", 2464 "SuchThat", "notni", "notniva", "NotReverseElement", "prod", "Product", "coprod", "Coproduct", "Sum", 2465 "sum", "minus", "MinusPlus", "mnplus", "mp", "dotplus", "plusdo", "Backslash", "setminus", "setmn", 2466 "smallsetminus", "ssetmn", "lowast", "compfn", "SmallCircle", "radic", "Sqrt", "prop", "Proportional", 2467 "propto", "varpropto", "vprop", "infin", "angrt", "ang", "angle", "angmsd", "measuredangle", "angsph", 2468 "mid", "shortmid", "smid", "VerticalBar", "nmid", "NotVerticalBar", "nshortmid", "nsmid", 2469 "DoubleVerticalBar", "par", "parallel", "shortparallel", "spar", "NotDoubleVerticalBar", "npar", 2470 "nparallel", "nshortparallel", "nspar", "and", "wedge", "or", "vee", "cap", "cup", "int", "Integral", 2471 "Int", "iiint", "tint", "conint", "ContourIntegral", "oint", "Conint", "DoubleContourIntegral", "Cconint", 2472 "cwint", "cwconint", "ClockwiseContourIntegral", "cwconint", "awconint", "there4", "Therefore", 2473 "therefore", "because", "ratio", "Colon", "Proportion", "dotminus", "minusd", "mDDot", "homtht", "sim", 2474 "thicksim", "thksim", "Tilde", "backsim", "bsim", "ac", "mstpos", "acd", "VerticalTilde", "wr", "wreath", 2475 "NotTilde", "nsim", "eqsim", "EqualTilde", "esim", "sime", "simeq", "TildeEqual", "NotTildeEqual", "nsime", 2476 "nsimeq", "cong", "TildeFullEqual", "simne", "ncong", "NotTildeFullEqual", "ap", "approx", "asymp", 2477 "thickapprox", "thkap", "TildeTilde", "nap", "napprox", "NotTildeTilde", "ape", "approxeq", "apid", 2478 "backcong", "bcong", "asympeq", "CupCap", "bump", "Bumpeq", "HumpDownHump", "bumpe", "bumpeq", "HumpEqual", 2479 "doteq", "DotEqual", "esdot", "doteqdot", "eDot", "efDot", "fallingdotseq", "erDot", "risingdotseq", 2480 "Assign", "colone", "coloneq", "ecolon", "eqcolon", "ecir", "eqcirc", "circeq", "cire", "wedgeq", "veeeq", 2481 "triangleq", 
"trie", "equest", "questeq", "ne", "NotEqual", "Congruent", "equiv", "nequiv", "NotCongruent", 2482 "le", "leq", "ge", "geq", "GreaterEqual", "lE", "leqq", "LessFullEqual", "gE", "geqq", "GreaterFullEqual", 2483 "lnE", "lneqq", "gnE", "gneqq", "ll", "Lt", "NestedLessLess", "gg", "Gt", "NestedGreaterGreater", 2484 "between", "twixt", "NotCupCap", "nless", "nlt", "NotLess", "ngt", "ngtr", "NotGreater", "nle", "nleq", 2485 "NotLessEqual", "nge", "ngeq", "NotGreaterEqual", "lesssim", "LessTilde", "lsim", "GreaterTilde", "gsim", 2486 "gtrsim", "nlsim", "NotLessTilde", "ngsim", "NotGreaterTilde", "LessGreater", "lessgtr", "lg", "gl", 2487 "GreaterLess", "gtrless", "NotLessGreater", "ntlg", "NotGreaterLess", "ntgl", "pr", "prec", "Precedes", 2488 "sc", "succ", "Succeeds", "prcue", "preccurlyeq", "PrecedesSlantEqual", "sccue", "succcurlyeq", 2489 "SucceedsSlantEqual", "PrecedesTilde", "precsim", "prsim", "scsim", "SucceedsTilde", "succsim", 2490 "NotPrecedes", "npr", "nprec", "NotSucceeds", "nsc", "nsucc", "sub", "subset", "sup", "Superset", "supset", 2491 "nsub", "nsup", "sube", "subseteq", "SubsetEqual", "supe", "SupersetEqual", "supseteq", "NotSubsetEqual", 2492 "nsube", "nsubseteq", "NotSupersetEqual", "nsupe", "nsupseteq", "subne", "subsetneq", "supne", "supsetneq", 2493 "cupdot", "UnionPlus", "uplus", "sqsub", "sqsubset", "SquareSubset", "sqsup", "sqsupset", "SquareSuperset", 2494 "sqsube", "sqsubseteq", "SquareSubsetEqual", "sqsupe", "sqsupseteq", "SquareSupersetEqual", "sqcap", 2495 "SquareIntersection", "sqcup", "SquareUnion", "CirclePlus", "oplus", "CircleMinus", "ominus", 2496 "CircleTimes", "otimes", "osol", "CircleDot", "odot", "circledcirc", "ocir", "circledast", "oast", 2497 "circleddash", "odash", "boxplus", "plusb", "boxminus", "minusb", "boxtimes", "timesb", "dotsquare", 2498 "sdotb", "RightTee;", "vdash", "dashv", "LeftTee", "DownTee", "top", "bot", "bottom", "perp", "UpTee", 2499 "models", "DoubleRightTee", "vDash", "Vdash", "Vvdash", "VDash", "nvdash", 
"nvDash", "nVdash", "nVDash", 2500 "prurel", "LeftTriangle", "vartriangleleft", "vltri", "RightTriangle", "vartriangleright", "vrtri", 2501 "LeftTriangleEqual", "ltrie", "trianglelefteq", "RightTriangleEqual", "rtrie", "trianglerighteq", "origof", 2502 "imof", "multimap", "mumap", "hercon", "intcal", "intercal", "veebar", "barvee", "angrtvb", "lrtri", 2503 "bigwedge", "Wedge", "xwedge", "bigvee", "Vee", "xvee", "bigcap", "Intersection", "xcap", "bigcup", 2504 "Union", "xcup", "diam", "Diamond", "diamond", "sdot", "sstarf", "Star", "divideontimes", "divonx", 2505 "bowtie", "ltimes", "rtimes", "leftthreetimes", "lthree", "rightthreetimes", "rthree", "backsimeq", 2506 "bsime", "curlyvee", "cuvee", "curlywedge", "cuwed", "Sub", "Subset", "Sup", "Supset", "Cap", "Cup", 2507 "fork", "pitchfork", "epar", "lessdot", "ltdot", "gtdot", "gtrdot", "Ll", "Gg", "ggg", "leg", "lesseqgtr", 2508 "LessEqualGreater", "gel", "GreaterEqualLess", "gtreqless", "cuepr", "curlyeqprec", "cuesc", "curlyeqsucc", 2509 "NotPrecedesSlantEqual", "nprcue", "NotSucceedsSlantEqual", "nsccue", "NotSquareSubsetEqual", "nsqsube", 2510 "NotSquareSupersetEqual", "nsqsupe", "lnsim", "gnsim", "precnsim", "prnsim", "scnsim", "succnsim", "nltri", 2511 "NotLeftTriangle", "ntriangleleft", "NotRightTriangle", "nrtri", "ntriangleright", "nltrie", 2512 "NotLeftTriangleEqual", "ntrianglelefteq", "NotRightTriangleEqual", "nrtrie", "ntrianglerighteq", "vellip", 2513 "ctdot", "utdot", "dtdot", "disin", "isinsv", "isins", "isindot", "notinvc", "notinvb", "isinE", "nisd", 2514 "xnis", "nis", "notnivc", "notnivb", "barwedge", "doublebarwedge", "lceil", "LeftCeiling", "rceil", 2515 "RightCeiling", "LeftFloor", "lfloor", "rfloor", "RightFloor", "drcrop", "dlcrop", "urcrop", "ulcrop", 2516 "bnot", "profline", "profsurf", "telrec", "target", "ulcorn", "ulcorner", "urcorn", "urcorner", "dlcorn", 2517 "llcorner", "drcorn", "lrcorner", "frown", "sfrown", "smile", "ssmile", "cylcty", "profalar", "topbot", 2518 "ovbar", 
"solbar", "angzarr", "lmoust", "lmoustache", "rmoust", "rmoustache", "OverBracket", "tbrk", 2519 "bbrk", "UnderBracket", "bbrktbrk", "OverParenthesis", "UnderParenthesis", "OverBrace", "UnderBrace", 2520 "trpezium", "elinters", "blank", "circledS", "oS", "boxh", "HorizontalLine", "boxv", "boxdr", "boxdl", 2521 "boxur", "boxul", "boxvr", "boxvl", "boxhd", "boxhu", "boxvh", "boxH", "boxV", "boxdR", "boxDr", "boxDR", 2522 "boxdL", "boxDl", "boxDL", "boxuR", "boxUr", "boxUR", "boxuL", "boxUl", "boxUL", "boxvR", "boxVr", "boxVR", 2523 "boxvL", "boxVl", "boxVL", "boxHd", "boxhD", "boxHD", "boxHu", "boxhU", "boxHU", "boxvH", "boxVh", "boxVH", 2524 "uhblk", "lhblk", "block", "blk14", "blk12", "blk34", "squ", "Square", "square", "blacksquare", 2525 "FilledVerySmallSquare", "squarf", "squf", "EmptyVerySmallSquare", "rect", "marker", "fltns", 2526 "bigtriangleup", "xutri", "blacktriangle", "utrif", "triangle", "utri", "blacktriangleright", "rtrif", 2527 "rtri", "triangleright", "bigtriangledown", "xdtri", "blacktriangledown", "dtrif", "dtri", "triangledown", 2528 "blacktriangleleft", "ltrif", "ltri", "triangleleft", "loz", "lozenge", "cir", "tridot", "bigcirc", 2529 "xcirc", "ultri", "urtri", "lltri", "EmptySmallSquare", "FilledSmallSquare", "bigstar", "starf", "star", 2530 "phone", "female", "male", "spades", "spadesuit", "clubs", "clubsuit", "hearts", "heartsuit", 2531 "diamondsuit", "diams", "sung", "flat", "natur", "natural", "sharp", "check", "checkmark", "cross", "malt", 2532 "maltese", "sext", "VerticalSeparator", "lbbrk", "rbbrk", "bsolhsub", "suphsol", "LeftDoubleBracket", 2533 "lobrk", "RightDoubleBracket", "robrk", "lang", "langle", "LeftAngleBracket", "rang", "rangle", 2534 "RightAngleBracket", "Lang", "Rang", "loang", "roang", "LongLeftArrow", "longleftarrow", "xlarr", 2535 "LongRightArrow", "longrightarrow", "xrarr", "LongLeftRightArrow", "longleftrightarrow", "xharr", 2536 "DoubleLongLeftArrow", "Longleftarrow", "xlArr", "DoubleLongRightArrow", 
"Longrightarrow", "xrArr", 2537 "DoubleLongLeftRightArrow", "Longleftrightarrow", "xhArr", "longmapsto", "xmap", "dzigrarr", "nvlArr", 2538 "nvrArr", "nvHarr", "Map", "lbarr", "bkarow", "rbarr", "lBarr", "dbkarow", "rBarr", "drbkarow", "RBarr", 2539 "DDotrahd", "UpArrowBar", "DownArrowBar", "Rarrtl", "latail", "ratail", "lAtail", "rAtail", "larrfs", 2540 "rarrfs", "larrbfs", "rarrbfs", "nwarhk", "nearhk", "hksearow", "searhk", "hkswarow", "swarhk", "nwnear", 2541 "nesear", "toea", "seswar", "tosa", "swnwar", "rarrc", "cudarrr", "ldca", "rdca", "cudarrl", "larrpl", 2542 "curarrm", "cularrp", "rarrpl", "harrcir", "Uarrocir", "lurdshar", "ldrushar", "LeftRightVector", 2543 "RightUpDownVector", "DownLeftRightVector", "LeftUpDownVector", "LeftVectorBar", "RightVectorBar", 2544 "RightUpVectorBar", "RightDownVectorBar", "DownLeftVectorBar", "DownRightVectorBar", "LeftUpVectorBar", 2545 "LeftDownVectorBar", "LeftTeeVector", "RightTeeVector", "RightUpTeeVector", "RightDownTeeVector", 2546 "DownLeftTeeVector", "DownRightTeeVector", "LeftUpTeeVector", "LeftDownTeeVector", "lHar", "uHar", "rHar", 2547 "dHar", "luruhar", "ldrdhar", "ruluhar", "rdldhar", "lharul", "llhard", "rharul", "lrhard", "udhar", 2548 "UpEquilibrium", "duhar", "ReverseUpEquilibrium", "RoundImplies", "erarr", "simrarr", "larrsim", "rarrsim", 2549 "rarrap", "ltlarr", "gtrarr", "subrarr", "suplarr", "lfisht", "rfisht", "ufisht", "dfisht", "lopar", 2550 "ropar", "lbrke", "rbrke", "lbrkslu", "rbrksld", "lbrksld", "rbrkslu", "langd", "rangd", "lparlt", 2551 "rpargt", "gtlPar", "ltrPar", "vzigzag", "vangrt", "angrtvbd", "ange", "range", "dwangle", "uwangle", 2552 "angmsdaa", "angmsdab", "angmsdac", "angmsdad", "angmsdae", "angmsdaf", "angmsdag", "angmsdah", "bemptyv", 2553 "demptyv", "cemptyv", "raemptyv", "laemptyv", "ohbar", "omid", "opar", "operp", "olcross", "odsold", 2554 "olcir", "ofcir", "olt", "ogt", "cirscir", "cirE", "solb", "bsolb", "boxbox", "trisb", "rtriltri", 2555 "LeftTriangleBar", 
"RightTriangleBar", "iinfin", "infintie", "nvinfin", "eparsl", "smeparsl", "eqvparsl", 2556 "blacklozenge", "lozf", "RuleDelayed", "dsol", "bigodot", "xodot", "bigoplus", "xoplus", "bigotimes", 2557 "xotime", "biguplus", "xuplus", "bigsqcup", "xsqcup", "iiiint", "qint", "fpartint", "cirfnint", "awint", 2558 "rppolint", "scpolint", "npolint", "pointint", "quatint", "intlarhk", "pluscir", "plusacir", "simplus", 2559 "plusdu", "plussim", "plustwo", "mcomma", "minusdu", "loplus", "roplus", "Cross", "timesd", "timesbar", 2560 "smashp", "lotimes", "rotimes", "otimesas", "Otimes", "odiv", "triplus", "triminus", "tritime", "intprod", 2561 "iprod", "amalg", "capdot", "ncup", "ncap", "capand", "cupor", "cupcap", "capcup", "cupbrcap", "capbrcup", 2562 "cupcup", "capcap", "ccups", "ccaps", "ccupssm", "And", "Or", "andand", "oror", "orslope", "andslope", 2563 "andv", "orv", "andd", "ord", "wedbar", "sdote", "simdot", "congdot", "easter", "apacir", "apE", "eplus", 2564 "pluse", "Esim", "Colone", "Equal", "ddotseq", "eDDot", "equivDD", "ltcir", "gtcir", "ltquest", "gtquest", 2565 "leqslant", "les", "LessSlantEqual", "geqslant", "ges", "GreaterSlantEqual", "lesdot", "gesdot", "lesdoto", 2566 "gesdoto", "lesdotor", "gesdotol", "lap", "lessapprox", "gap", "gtrapprox", "lne", "lneq", "gne", "gneq", 2567 "lnap", "lnapprox", "gnap", "gnapprox", "lEg", "lesseqqgtr", "gEl", "gtreqqless", "lsime", "gsime", 2568 "lsimg", "gsiml", "lgE", "glE", "lesges", "gesles", "els", "eqslantless", "egs", "eqslantgtr", "elsdot", 2569 "egsdot", "el", "eg", "siml", "simg", "simlE", "simgE", "LessLess", "GreaterGreater", "glj", "gla", "ltcc", 2570 "gtcc", "lescc", "gescc", "smt", "lat", "smte", "late", "bumpE", "pre", "PrecedesEqual", "preceq", "sce", 2571 "SucceedsEqual", "succeq", "prE", "scE", "precneqq", "prnE", "scnE", "succneqq", "prap", "precapprox", 2572 "scap", "succapprox", "precnapprox", "prnap", "scnap", "succnapprox", "Pr", "Sc", "subdot", "supdot", 2573 "subplus", "supplus", "submult", 
"supmult", "subedot", "supedot", "subE", "subseteqq", "supE", "supseteqq", 2574 "subsim", "supsim", "subnE", "subsetneqq", "supnE", "supsetneqq", "csub", "csup", "csube", "csupe", 2575 "subsup", "supsub", "subsub", "supsup", "suphsub", "supdsub", "forkv", "topfork", "mlcp", "Dashv", 2576 "DoubleLeftTee", "Vdashl", "Barv", "vBar", "vBarv", "Vbar", "Not", "bNot", "rnmid", "cirmid", "midcir", 2577 "topcir", "nhpar", "parsim", "parsl", "fflig", "filig", "fllig", "ffilig", "ffllig", "Ascr", "Cscr", 2578 "Dscr", "Gscr", "Jscr", "Kscr", "Nscr", "Oscr", "Pscr", "Qscr", "Sscr", "Tscr", "Uscr", "Vscr", "Wscr", 2579 "Xscr", "Yscr", "Zscr", "ascr", "bscr", "cscr", "dscr", "fscr", "hscr", "iscr", "jscr", "kscr", "lscr", 2580 "mscr", "nscr", "pscr", "qscr", "rscr", "sscr", "tscr", "uscr", "vscr", "wscr", "xscr", "yscr", "zscr", 2581 "Afr", "Bfr", "Dfr", "Efr", "Ffr", "Gfr", "Jfr", "Kfr", "Lfr", "Mfr", "Nfr", "Ofr", "Pfr", "Qfr", "Sfr", 2582 "Tfr", "Ufr", "Vfr", "Wfr", "Xfr", "Yfr", "afr", "bfr", "cfr", "dfr", "efr", "ffr", "gfr", "hfr", "ifr", 2583 "jfr", "kfr", "lfr", "mfr", "nfr", "ofr", "pfr", "qfr", "rfr", "sfr", "tfr", "ufr", "vfr", "wfr", "xfr", 2584 "yfr", "zfr", "Aopf", "Bopf", "Dopf", "Eopf", "Fopf", "Gopf", "Iopf", "Jopf", "Kopf", "Lopf", "Mopf", 2585 "Oopf", "Sopf", "Topf", "Uopf", "Vopf", "Wopf", "Xopf", "Yopf", "aopf", "bopf", "copf", "dopf", "eopf", 2586 "fopf", "gopf", "hopf", "iopf", "jopf", "kopf", "lopf", "mopf", "nopf", "oopf", "popf", "qopf", "ropf", 2587 "sopf", "topf", "uopf", "vopf", "wopf", "xopf", "yopf", "zopf", "nvlt", "bne", "nvgt", "fjlig", 2588 "ThickSpace", "nrarrw", "npart", "nang", "caps", "cups", "nvsim", "race", "acE", "nesim", "NotEqualTilde", 2589 "napid", "nvap", "nbump", "NotHumpDownHump", "nbumpe", "NotHumpEqual", "nedot", "bnequiv", "nvle", "nvge", 2590 "nlE", "nleqq", "ngE", "ngeqq", "NotGreaterFullEqual", "lvertneqq", "lvnE", "gvertneqq", "gvnE", "nLtv", 2591 "NotLessLess", "nLt", "nGtv", "NotGreaterGreater", "nGt", "NotSucceedsTilde", 
"NotSubset", "nsubset", 2592 "vnsub", "NotSuperset", "nsupset", "vnsup", "varsubsetneq", "vsubne", "varsupsetneq", "vsupne", 2593 "NotSquareSubset", "NotSquareSuperset", "sqcaps", "sqcups", "nvltrie", "nvrtrie", "nLl", "nGg", "lesg", 2594 "gesl", "notindot", "notinE", "nrarrc", "NotLeftTriangleBar", "NotRightTriangleBar", "ncongdot", "napE", 2595 "nleqslant", "nles", "NotLessSlantEqual", "ngeqslant", "nges", "NotGreaterSlantEqual", "NotNestedLessLess", 2596 "NotNestedGreaterGreater", "smtes", "lates", "NotPrecedesEqual", "npre", "npreceq", "NotSucceedsEqual", 2597 "nsce", "nsucceq", "nsubE", "nsubseteqq", "nsupE", "nsupseteqq", "varsubsetneqq", "vsubnE", 2598 "varsupsetneqq", "vsupnE", "nparsl"}; 2599 private static final int[] CODEPOINTS = 2600 {33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 42, 43, 44, 46, 47, 58, 59, 60, 61, 62, 63, 64, 91, 91, 92, 93, 93, 2601 94, 95, 95, 96, 96, 123, 123, 124, 124, 124, 125, 125, 160, 160, 161, 162, 163, 164, 165, 166, 167, 168, 2602 168, 168, 168, 169, 170, 171, 172, 173, 174, 174, 175, 175, 176, 177, 177, 178, 179, 180, 180, 181, 182, 2603 183, 183, 183, 184, 184, 185, 186, 187, 188, 189, 189, 190, 191, 192, 193, 194, 195, 196, 197, 197, 198, 2604 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 2605 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 2606 241, 242, 243, 244, 245, 246, 247, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 260, 2607 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 278, 279, 280, 281, 282, 283, 2608 284, 285, 286, 287, 288, 289, 290, 292, 293, 294, 295, 296, 297, 298, 299, 302, 303, 304, 305, 305, 306, 2609 307, 308, 309, 310, 311, 312, 313, 314, 315, 316, 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 2610 328, 329, 330, 331, 332, 333, 336, 337, 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350, 2611 351, 352, 353, 354, 355, 356, 
357, 358, 359, 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 2612 372, 373, 374, 375, 376, 377, 378, 379, 380, 381, 382, 402, 437, 501, 567, 710, 711, 711, 728, 728, 729, 2613 729, 730, 731, 732, 732, 733, 733, 785, 913, 914, 915, 916, 917, 918, 919, 920, 921, 922, 923, 924, 925, 2614 926, 927, 928, 929, 931, 932, 933, 934, 935, 936, 937, 937, 945, 946, 947, 948, 949, 949, 950, 951, 952, 2615 953, 954, 955, 956, 957, 958, 959, 960, 961, 962, 962, 962, 963, 964, 965, 965, 966, 967, 968, 969, 977, 2616 977, 977, 978, 978, 981, 981, 981, 982, 982, 988, 989, 989, 1008, 1008, 1009, 1009, 1013, 1013, 1013, 1014, 2617 1014, 1025, 1026, 1027, 1028, 1029, 1030, 1031, 1032, 1033, 1034, 1035, 1036, 1038, 1039, 1040, 1041, 1042, 2618 1043, 1044, 1045, 1046, 1047, 1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055, 1056, 1057, 1058, 1059, 1060, 2619 1061, 1062, 1063, 1064, 1065, 1066, 1067, 1068, 1069, 1070, 1071, 1072, 1073, 1074, 1075, 1076, 1077, 1078, 2620 1079, 1080, 1081, 1082, 1083, 1084, 1085, 1086, 1087, 1088, 1089, 1090, 1091, 1092, 1093, 1094, 1095, 1096, 2621 1097, 1098, 1099, 1100, 1101, 1102, 1103, 1105, 1106, 1107, 1108, 1109, 1110, 1111, 1112, 1113, 1114, 1115, 2622 1116, 1118, 1119, 8194, 8195, 8196, 8197, 8199, 8200, 8201, 8201, 8202, 8202, 8203, 8204, 8205, 8206, 8207, 2623 8208, 8208, 8211, 8212, 8213, 8214, 8214, 8216, 8216, 8217, 8217, 8217, 8218, 8218, 8220, 8220, 8221, 8221, 2624 8221, 8222, 8222, 8224, 8225, 8226, 8226, 8229, 8230, 8230, 8240, 8241, 8242, 8243, 8244, 8245, 8245, 8249, 2625 8250, 8254, 8254, 8257, 8259, 8260, 8271, 8279, 8287, 8288, 8289, 8289, 8290, 8290, 8291, 8291, 8364, 8411, 2626 8411, 8412, 8450, 8450, 8453, 8458, 8459, 8459, 8459, 8460, 8460, 8461, 8461, 8462, 8463, 8463, 8463, 8463, 2627 8464, 8464, 8465, 8465, 8465, 8465, 8466, 8466, 8466, 8467, 8469, 8469, 8470, 8471, 8472, 8472, 8473, 8473, 2628 8474, 8474, 8475, 8475, 8476, 8476, 8476, 8476, 8477, 8477, 8478, 8482, 8482, 8484, 8484, 8487, 8488, 8488, 2629 8489, 
8492, 8492, 8492, 8493, 8493, 8495, 8496, 8496, 8497, 8497, 8499, 8499, 8499, 8500, 8500, 8500, 8501, 2630 8501, 8502, 8503, 8504, 8517, 8517, 8518, 8518, 8519, 8519, 8519, 8520, 8520, 8531, 8532, 8533, 8534, 8535, 2631 8536, 8537, 8538, 8539, 8540, 8541, 8542, 8592, 8592, 8592, 8592, 8592, 8593, 8593, 8593, 8593, 8594, 8594, 2632 8594, 8594, 8594, 8595, 8595, 8595, 8595, 8596, 8596, 8596, 8597, 8597, 8597, 8598, 8598, 8598, 8599, 8599, 2633 8599, 8600, 8600, 8600, 8601, 8601, 8601, 8602, 8602, 8603, 8603, 8605, 8605, 8606, 8606, 8607, 8608, 8608, 2634 8609, 8610, 8610, 8611, 8611, 8612, 8612, 8613, 8613, 8614, 8614, 8614, 8615, 8615, 8617, 8617, 8618, 8618, 2635 8619, 8619, 8620, 8620, 8621, 8621, 8622, 8622, 8624, 8624, 8625, 8625, 8626, 8627, 8629, 8630, 8630, 8631, 2636 8631, 8634, 8634, 8635, 8635, 8636, 8636, 8636, 8637, 8637, 8637, 8638, 8638, 8638, 8639, 8639, 8639, 8640, 2637 8640, 8640, 8641, 8641, 8641, 8642, 8642, 8642, 8643, 8643, 8643, 8644, 8644, 8644, 8645, 8645, 8646, 8646, 2638 8646, 8647, 8647, 8648, 8648, 8649, 8649, 8650, 8650, 8651, 8651, 8651, 8652, 8652, 8652, 8653, 8653, 8654, 2639 8654, 8655, 8655, 8656, 8656, 8656, 8657, 8657, 8657, 8658, 8658, 8658, 8658, 8659, 8659, 8659, 8660, 8660, 2640 8660, 8660, 8661, 8661, 8661, 8662, 8663, 8664, 8665, 8666, 8666, 8667, 8667, 8669, 8676, 8676, 8677, 8677, 2641 8693, 8693, 8701, 8702, 8703, 8704, 8704, 8705, 8705, 8706, 8706, 8707, 8707, 8708, 8708, 8708, 8709, 8709, 2642 8709, 8709, 8711, 8711, 8712, 8712, 8712, 8712, 8713, 8713, 8713, 8715, 8715, 8715, 8715, 8716, 8716, 8716, 2643 8719, 8719, 8720, 8720, 8721, 8721, 8722, 8723, 8723, 8723, 8724, 8724, 8726, 8726, 8726, 8726, 8726, 8727, 2644 8728, 8728, 8730, 8730, 8733, 8733, 8733, 8733, 8733, 8734, 8735, 8736, 8736, 8737, 8737, 8738, 8739, 8739, 2645 8739, 8739, 8740, 8740, 8740, 8740, 8741, 8741, 8741, 8741, 8741, 8742, 8742, 8742, 8742, 8742, 8743, 8743, 2646 8744, 8744, 8745, 8746, 8747, 8747, 8748, 8749, 8749, 8750, 8750, 8750, 8751, 8751, 
8752, 8753, 8754, 8754, 2647 8754, 8755, 8756, 8756, 8756, 8757, 8758, 8759, 8759, 8760, 8760, 8762, 8763, 8764, 8764, 8764, 8764, 8765, 2648 8765, 8766, 8766, 8767, 8768, 8768, 8768, 8769, 8769, 8770, 8770, 8770, 8771, 8771, 8771, 8772, 8772, 8772, 2649 8773, 8773, 8774, 8775, 8775, 8776, 8776, 8776, 8776, 8776, 8776, 8777, 8777, 8777, 8778, 8778, 8779, 8780, 2650 8780, 8781, 8781, 8782, 8782, 8782, 8783, 8783, 8783, 8784, 8784, 8784, 8785, 8785, 8786, 8786, 8787, 8787, 2651 8788, 8788, 8788, 8789, 8789, 8790, 8790, 8791, 8791, 8793, 8794, 8796, 8796, 8799, 8799, 8800, 8800, 8801, 2652 8801, 8802, 8802, 8804, 8804, 8805, 8805, 8805, 8806, 8806, 8806, 8807, 8807, 8807, 8808, 8808, 8809, 8809, 2653 8810, 8810, 8810, 8811, 8811, 8811, 8812, 8812, 8813, 8814, 8814, 8814, 8815, 8815, 8815, 8816, 8816, 8816, 2654 8817, 8817, 8817, 8818, 8818, 8818, 8819, 8819, 8819, 8820, 8820, 8821, 8821, 8822, 8822, 8822, 8823, 8823, 2655 8823, 8824, 8824, 8825, 8825, 8826, 8826, 8826, 8827, 8827, 8827, 8828, 8828, 8828, 8829, 8829, 8829, 8830, 2656 8830, 8830, 8831, 8831, 8831, 8832, 8832, 8832, 8833, 8833, 8833, 8834, 8834, 8835, 8835, 8835, 8836, 8837, 2657 8838, 8838, 8838, 8839, 8839, 8839, 8840, 8840, 8840, 8841, 8841, 8841, 8842, 8842, 8843, 8843, 8845, 8846, 2658 8846, 8847, 8847, 8847, 8848, 8848, 8848, 8849, 8849, 8849, 8850, 8850, 8850, 8851, 8851, 8852, 8852, 8853, 2659 8853, 8854, 8854, 8855, 8855, 8856, 8857, 8857, 8858, 8858, 8859, 8859, 8861, 8861, 8862, 8862, 8863, 8863, 2660 8864, 8864, 8865, 8865, 8866, 8866, 8867, 8867, 8868, 8868, 8869, 8869, 8869, 8869, 8871, 8872, 8872, 8873, 2661 8874, 8875, 8876, 8877, 8878, 8879, 8880, 8882, 8882, 8882, 8883, 8883, 8883, 8884, 8884, 8884, 8885, 8885, 2662 8885, 8886, 8887, 8888, 8888, 8889, 8890, 8890, 8891, 8893, 8894, 8895, 8896, 8896, 8896, 8897, 8897, 8897, 2663 8898, 8898, 8898, 8899, 8899, 8899, 8900, 8900, 8900, 8901, 8902, 8902, 8903, 8903, 8904, 8905, 8906, 8907, 2664 8907, 8908, 8908, 8909, 8909, 8910, 8910, 8911, 
8911, 8912, 8912, 8913, 8913, 8914, 8915, 8916, 8916, 8917, 2665 8918, 8918, 8919, 8919, 8920, 8921, 8921, 8922, 8922, 8922, 8923, 8923, 8923, 8926, 8926, 8927, 8927, 8928, 2666 8928, 8929, 8929, 8930, 8930, 8931, 8931, 8934, 8935, 8936, 8936, 8937, 8937, 8938, 8938, 8938, 8939, 8939, 2667 8939, 8940, 8940, 8940, 8941, 8941, 8941, 8942, 8943, 8944, 8945, 8946, 8947, 8948, 8949, 8950, 8951, 8953, 2668 8954, 8955, 8956, 8957, 8958, 8965, 8966, 8968, 8968, 8969, 8969, 8970, 8970, 8971, 8971, 8972, 8973, 8974, 2669 8975, 8976, 8978, 8979, 8981, 8982, 8988, 8988, 8989, 8989, 8990, 8990, 8991, 8991, 8994, 8994, 8995, 8995, 2670 9005, 9006, 9014, 9021, 9023, 9084, 9136, 9136, 9137, 9137, 9140, 9140, 9141, 9141, 9142, 9180, 9181, 9182, 2671 9183, 9186, 9191, 9251, 9416, 9416, 9472, 9472, 9474, 9484, 9488, 9492, 9496, 9500, 9508, 9516, 9524, 9532, 2672 9552, 9553, 9554, 9555, 9556, 9557, 9558, 9559, 9560, 9561, 9562, 9563, 9564, 9565, 9566, 9567, 9568, 9569, 2673 9570, 9571, 9572, 9573, 9574, 9575, 9576, 9577, 9578, 9579, 9580, 9600, 9604, 9608, 9617, 9618, 9619, 9633, 2674 9633, 9633, 9642, 9642, 9642, 9642, 9643, 9645, 9646, 9649, 9651, 9651, 9652, 9652, 9653, 9653, 9656, 9656, 2675 9657, 9657, 9661, 9661, 9662, 9662, 9663, 9663, 9666, 9666, 9667, 9667, 9674, 9674, 9675, 9708, 9711, 9711, 2676 9720, 9721, 9722, 9723, 9724, 9733, 9733, 9734, 9742, 9792, 9794, 9824, 9824, 9827, 9827, 9829, 9829, 9830, 2677 9830, 9834, 9837, 9838, 9838, 9839, 10003, 10003, 10007, 10016, 10016, 10038, 10072, 10098, 10099, 10184, 2678 10185, 10214, 10214, 10215, 10215, 10216, 10216, 10216, 10217, 10217, 10217, 10218, 10219, 10220, 10221, 2679 10229, 10229, 10229, 10230, 10230, 10230, 10231, 10231, 10231, 10232, 10232, 10232, 10233, 10233, 10233, 2680 10234, 10234, 10234, 10236, 10236, 10239, 10498, 10499, 10500, 10501, 10508, 10509, 10509, 10510, 10511, 2681 10511, 10512, 10512, 10513, 10514, 10515, 10518, 10521, 10522, 10523, 10524, 10525, 10526, 10527, 10528, 2682 10531, 10532, 10533, 10533, 
10534, 10534, 10535, 10536, 10536, 10537, 10537, 10538, 10547, 10549, 10550, 2683 10551, 10552, 10553, 10556, 10557, 10565, 10568, 10569, 10570, 10571, 10574, 10575, 10576, 10577, 10578, 2684 10579, 10580, 10581, 10582, 10583, 10584, 10585, 10586, 10587, 10588, 10589, 10590, 10591, 10592, 10593, 2685 10594, 10595, 10596, 10597, 10598, 10599, 10600, 10601, 10602, 10603, 10604, 10605, 10606, 10606, 10607, 2686 10607, 10608, 10609, 10610, 10611, 10612, 10613, 10614, 10616, 10617, 10619, 10620, 10621, 10622, 10623, 2687 10629, 10630, 10635, 10636, 10637, 10638, 10639, 10640, 10641, 10642, 10643, 10644, 10645, 10646, 10650, 2688 10652, 10653, 10660, 10661, 10662, 10663, 10664, 10665, 10666, 10667, 10668, 10669, 10670, 10671, 10672, 2689 10673, 10674, 10675, 10676, 10677, 10678, 10679, 10681, 10683, 10684, 10686, 10687, 10688, 10689, 10690, 2690 10691, 10692, 10693, 10697, 10701, 10702, 10703, 10704, 10716, 10717, 10718, 10723, 10724, 10725, 10731, 2691 10731, 10740, 10742, 10752, 10752, 10753, 10753, 10754, 10754, 10756, 10756, 10758, 10758, 10764, 10764, 2692 10765, 10768, 10769, 10770, 10771, 10772, 10773, 10774, 10775, 10786, 10787, 10788, 10789, 10790, 10791, 2693 10793, 10794, 10797, 10798, 10799, 10800, 10801, 10803, 10804, 10805, 10806, 10807, 10808, 10809, 10810, 2694 10811, 10812, 10812, 10815, 10816, 10818, 10819, 10820, 10821, 10822, 10823, 10824, 10825, 10826, 10827, 2695 10828, 10829, 10832, 10835, 10836, 10837, 10838, 10839, 10840, 10842, 10843, 10844, 10845, 10847, 10854, 2696 10858, 10861, 10862, 10863, 10864, 10865, 10866, 10867, 10868, 10869, 10871, 10871, 10872, 10873, 10874, 2697 10875, 10876, 10877, 10877, 10877, 10878, 10878, 10878, 10879, 10880, 10881, 10882, 10883, 10884, 10885, 2698 10885, 10886, 10886, 10887, 10887, 10888, 10888, 10889, 10889, 10890, 10890, 10891, 10891, 10892, 10892, 2699 10893, 10894, 10895, 10896, 10897, 10898, 10899, 10900, 10901, 10901, 10902, 10902, 10903, 10904, 10905, 2700 10906, 10909, 10910, 10911, 10912, 10913, 
10914, 10916, 10917, 10918, 10919, 10920, 10921, 10922, 10923, 2701 10924, 10925, 10926, 10927, 10927, 10927, 10928, 10928, 10928, 10931, 10932, 10933, 10933, 10934, 10934, 2702 10935, 10935, 10936, 10936, 10937, 10937, 10938, 10938, 10939, 10940, 10941, 10942, 10943, 10944, 10945, 2703 10946, 10947, 10948, 10949, 10949, 10950, 10950, 10951, 10952, 10955, 10955, 10956, 10956, 10959, 10960, 2704 10961, 10962, 10963, 10964, 10965, 10966, 10967, 10968, 10969, 10970, 10971, 10980, 10980, 10982, 10983, 2705 10984, 10985, 10987, 10988, 10989, 10990, 10991, 10992, 10993, 10994, 10995, 11005, 64256, 64257, 64258, 2706 64259, 64260, 119964, 119966, 119967, 119970, 119973, 119974, 119977, 119978, 119979, 119980, 119982, 2707 119983, 119984, 119985, 119986, 119987, 119988, 119989, 119990, 119991, 119992, 119993, 119995, 119997, 2708 119998, 119999, 120000, 120001, 120002, 120003, 120005, 120006, 120007, 120008, 120009, 120010, 120011, 2709 120012, 120013, 120014, 120015, 120068, 120069, 120071, 120072, 120073, 120074, 120077, 120078, 120079, 2710 120080, 120081, 120082, 120083, 120084, 120086, 120087, 120088, 120089, 120090, 120091, 120092, 120094, 2711 120095, 120096, 120097, 120098, 120099, 120100, 120101, 120102, 120103, 120104, 120105, 120106, 120107, 2712 120108, 120109, 120110, 120111, 120112, 120113, 120114, 120115, 120116, 120117, 120118, 120119, 120120, 2713 120121, 120123, 120124, 120125, 120126, 120128, 120129, 120130, 120131, 120132, 120134, 120138, 120139, 2714 120140, 120141, 120142, 120143, 120144, 120146, 120147, 120148, 120149, 120150, 120151, 120152, 120153, 2715 120154, 120155, 120156, 120157, 120158, 120159, 120160, 120161, 120162, 120163, 120164, 120165, 120166, 2716 120167, 120168, 120169, 120170, 120171}; 2717 private static final long[] COMBINED_DIACRITICALS = 2718 {0x003C020D2L, 0x003D020E5L, 0x003E020D2L, 0x00660006AL, 0x205F0200AL, 0x219D00338L, 0x220200338L, 2719 0x2220020D2L, 0x22290FE00L, 0x222A0FE00L, 0x223C020D2L, 0x223D00331L, 0x223E00333L, 
0x224200338L, 2720 0x224200338L, 0x224B00338L, 0x224D020D2L, 0x224E00338L, 0x224E00338L, 0x224F00338L, 0x224F00338L, 2721 0x225000338L, 0x2261020E5L, 0x2264020D2L, 0x2265020D2L, 0x226600338L, 0x226600338L, 0x226700338L, 2722 0x226700338L, 0x226700338L, 0x22680FE00L, 0x22680FE00L, 0x22690FE00L, 0x22690FE00L, 0x226A00338L, 2723 0x226A00338L, 0x226A020D2L, 0x226B00338L, 0x226B00338L, 0x226B020D2L, 0x227F00338L, 0x2282020D2L, 2724 0x2282020D2L, 0x2282020D2L, 0x2283020D2L, 0x2283020D2L, 0x2283020D2L, 0x228A0FE00L, 0x228A0FE00L, 2725 0x228B0FE00L, 0x228B0FE00L, 0x228F00338L, 0x229000338L, 0x22930FE00L, 0x22940FE00L, 0x22B4020D2L, 2726 0x22B5020D2L, 0x22D800338L, 0x22D900338L, 0x22DA0FE00L, 0x22DB0FE00L, 0x22F500338L, 0x22F900338L, 2727 0x293300338L, 0x29CF00338L, 0x29D000338L, 0x2A6D00338L, 0x2A7000338L, 0x2A7D00338L, 0x2A7D00338L, 2728 0x2A7D00338L, 0x2A7E00338L, 0x2A7E00338L, 0x2A7E00338L, 0x2AA100338L, 0x2AA200338L, 0x2AAC0FE00L, 2729 0x2AAD0FE00L, 0x2AAF00338L, 0x2AAF00338L, 0x2AAF00338L, 0x2AB000338L, 0x2AB000338L, 0x2AB000338L, 2730 0x2AC500338L, 0x2AC500338L, 0x2AC600338L, 0x2AC600338L, 0x2ACB0FE00L, 0x2ACB0FE00L, 0x2ACC0FE00L, 2731 0x2ACC0FE00L, 0x2AFD020E5L}; 2732 2733 private static final int MIN_ESCAPE; 2734 private static final int MAX_ESCAPE; 2735 private static final HashMap<String, int[]> LOOKUP_MAP; 2736 2737 static { 2738 int minEscape = Integer.MAX_VALUE; 2739 int maxEscape = Integer.MIN_VALUE; 2740 HashMap<String, int[]> lookupMap = new HashMap<>(NAMES.length); 2741 2742 for (String name : NAMES) { 2743 minEscape = Math.min(minEscape, name.length()); 2744 maxEscape = Math.max(maxEscape, name.length()); 2745 } 2746 2747 for (int i = 0; i < CODEPOINTS.length; i++) 2748 lookupMap.put(NAMES[i], new int[]{CODEPOINTS[i]}); 2749 2750 for (int i = 0; i < COMBINED_DIACRITICALS.length; i++) { 2751 long combinedDiacritical = COMBINED_DIACRITICALS[i]; 2752 int codepoint1 = (int) (combinedDiacritical >> 20); 2753 int codepoint2 = (int) (combinedDiacritical & 
0xFFFFF); 2754 lookupMap.put(NAMES[CODEPOINTS.length + i], new int[]{codepoint1, codepoint2}); 2755 } 2756 2757 MIN_ESCAPE = minEscape; 2758 MAX_ESCAPE = maxEscape; 2759 LOOKUP_MAP = lookupMap; 2760 } 2761 2762 public static String unescapeHtml(String input) { 2763 StringBuilder result = null; 2764 2765 int len = input.length(); 2766 int start = 0; 2767 int escStart = 0; 2768 while (true) { 2769 // Look for '&' 2770 while (escStart < len && input.charAt(escStart) != '&') 2771 escStart++; 2772 2773 if (escStart == len) 2774 break; 2775 2776 escStart++; 2777 2778 // Found '&'. Look for ';' 2779 int escEnd = escStart; 2780 while (escEnd < len && escEnd - escStart < MAX_ESCAPE + 1 && input.charAt(escEnd) != ';') 2781 escEnd++; 2782 2783 if (escEnd == len) 2784 break; 2785 2786 // Bail if this is not a potential HTML entity. 2787 if (escEnd - escStart < MIN_ESCAPE || escEnd - escStart == MAX_ESCAPE + 1) { 2788 escStart++; 2789 continue; 2790 } 2791 2792 // Check the kind of entity 2793 if (input.charAt(escStart) == '#') { 2794 // Numeric entity 2795 int numStart = escStart + 1; 2796 int radix; 2797 2798 char firstChar = input.charAt(numStart); 2799 if (firstChar == 'x' || firstChar == 'X') { 2800 numStart++; 2801 radix = 16; 2802 } else { 2803 radix = 10; 2804 } 2805 2806 try { 2807 int entityValue = Integer.parseInt(input.substring(numStart, escEnd), radix); 2808 2809 if (result == null) 2810 result = new StringBuilder(input.length()); 2811 2812 result.append(input, start, escStart - 1); 2813 2814 if (entityValue > 0xFFFF) 2815 result.append(Character.toChars(entityValue)); 2816 else 2817 result.append((char) entityValue); 2818 } catch (NumberFormatException ignored) { 2819 escStart++; 2820 continue; 2821 } 2822 } else { 2823 // Named entity 2824 int[] codePoints = LOOKUP_MAP.get(input.substring(escStart, escEnd)); 2825 if (codePoints == null) { 2826 escStart++; 2827 continue; 2828 } 2829 2830 if (result == null) 2831 result = new StringBuilder(input.length()); 2832 
2833 result.append(input, start, escStart - 1); 2834 for (int codePoint : codePoints) 2835 result.appendCodePoint(codePoint); 2836 } 2837 2838 // Skip escape 2839 start = escEnd + 1; 2840 escStart = start; 2841 } 2842 2843 if (result != null) { 2844 result.append(input, start, len); 2845 return result.toString(); 2846 } 2847 2848 return input; 2849 } 2850 } 2851 2852 // *** END HTML-Unescaper source *** 2853}