Source code

001/*
002 * Copyright 2022-2025 Revetware LLC.
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 * http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 *
016 * Some of the code below is sourced from the Apache Tomcat fork of Apache commons-fileupload.
017 * See https://github.com/apache/tomcat for the original.
018 * It is also licensed under the terms of the Apache License, Version 2.0.
019 */
020
021package com.soklet.core.impl;
022
023import com.soklet.core.MultipartField;
024import com.soklet.core.MultipartParser;
025import com.soklet.core.Request;
026import com.soklet.core.Utilities;
027import com.soklet.exception.MissingRequestHeaderException;
028import com.soklet.internal.spring.LinkedCaseInsensitiveMap;
029
030import javax.annotation.Nonnull;
031import javax.annotation.concurrent.ThreadSafe;
032import java.io.ByteArrayInputStream;
033import java.io.ByteArrayOutputStream;
034import java.io.Closeable;
035import java.io.IOException;
036import java.io.InputStream;
037import java.io.OutputStream;
038import java.io.UncheckedIOException;
039import java.io.UnsupportedEncodingException;
040import java.nio.charset.Charset;
041import java.nio.charset.StandardCharsets;
042import java.text.ParseException;
043import java.util.Base64;
044import java.util.HashMap;
045import java.util.Iterator;
046import java.util.LinkedHashMap;
047import java.util.LinkedHashSet;
048import java.util.Locale;
049import java.util.Map;
050import java.util.Set;
051
052import static com.soklet.core.Utilities.trimAggressivelyToNull;
053
054/**
055 * @author <a href="https://www.revetkn.com">Mark Allen</a>
056 */
057@ThreadSafe
058public class DefaultMultipartParser implements MultipartParser {
059        @Nonnull
060        private static final DefaultMultipartParser SHARED_INSTANCE;
061
062        static {
063                SHARED_INSTANCE = new DefaultMultipartParser();
064        }
065
066        @Nonnull
067        public static DefaultMultipartParser sharedInstance() {
068                return SHARED_INSTANCE;
069        }
070
071        @Override
072        @Nonnull
073        public Map<String, Set<MultipartField>> extractMultipartFields(@Nonnull Request request) {
074                byte[] requestBody = request.getBody().orElse(null);
075
076                if (requestBody == null)
077                        return Map.of();
078
079                // Required for embedded commons-upload code
080                MultipartStream.ProgressNotifier progressNotifier = new MultipartStream.ProgressNotifier(new ProgressListener() {
081                        @Override
082                        public void update(long bytesRead, long contentLength, int items) {
083                                // Ignored for now
084                        }
085                }, requestBody.length) {
086                        @Override
087                        void noteBytesRead(int pBytes) {
088                                // Ignored for now
089                        }
090
091                        @Override
092                        public void noteItem() {
093                                // Ignored for now
094                        }
095                };
096
097                String contentTypeHeader = request.getHeader("Content-Type").orElse(null);
098
099                if (contentTypeHeader == null)
100                        throw new MissingRequestHeaderException("The 'Content-Type' header must be specified for multipart requests.", "Content-Type");
101
102                Map<String, String> contentTypeHeaderFields = extractFields(contentTypeHeader);
103                Map<String, Set<MultipartField>> multipartFieldsByName = new LinkedHashMap<>();
104
105                try (ByteArrayInputStream input = new ByteArrayInputStream(requestBody)) {
106                        MultipartStream multipartStream = new MultipartStream(input, contentTypeHeaderFields.get("boundary").getBytes(), progressNotifier);
107
108                        boolean hasNext = multipartStream.skipPreamble();
109
110                        while (hasNext) {
111                                // Example headers:
112                                //
113                                // Content-Disposition: form-data; name="doc"; filename="test.pdf"
114                                // Content-Type: application/pdf
115                                // Use a case-insensitive map for simplified lookups
116                                Map<String, String> headers = splitHeaders(multipartStream.readHeaders());
117                                String contentDisposition = trimAggressivelyToNull(headers.get("Content-Disposition"));
118                                Map<String, String> contentDispositionFields = Map.of();
119
120                                if (contentDisposition != null)
121                                        contentDispositionFields = new ParameterParser().parse(contentDisposition, ';');
122
123                                String name = trimAggressivelyToNull(contentDispositionFields.get("name"));
124
125                                if (name == null)
126                                        continue;
127
128                                ByteArrayOutputStream data = new ByteArrayOutputStream();
129                                multipartStream.readBodyData(data);
130
131                                String filename = trimAggressivelyToNull(contentDispositionFields.get("filename"));
132
133                                // For example:
134                                // "Screenshot-1.53.26&#8239;PM.png"
135                                // becomes
136                                // "Screenshot-1.53.26 PM.png"
137                                if (filename != null)
138                                        filename = HTMLUtilities.unescapeHtml(filename);
139
140                                String contentTypeHeaderValue = trimAggressivelyToNull(headers.get("Content-Type"));
141                                String contentType = Utilities.extractContentTypeFromHeaderValue(contentTypeHeaderValue).orElse(null);
142                                Charset charset = Utilities.extractCharsetFromHeaderValue(contentTypeHeaderValue).orElse(null);
143
144                                MultipartField multipartField = MultipartField.with(name, data.toByteArray())
145                                                .filename(filename)
146                                                .contentType(contentType)
147                                                .charset(charset)
148                                                .build();
149
150                                Set<MultipartField> multipartFields = multipartFieldsByName.get(name);
151
152                                if (multipartFields == null) {
153                                        multipartFields = new LinkedHashSet<>();
154                                        multipartFieldsByName.put(name, multipartFields);
155                                }
156
157                                multipartFields.add(multipartField);
158
159                                hasNext = multipartStream.readBoundary();
160                        }
161                } catch (IOException e) {
162                        throw new UncheckedIOException(e);
163                }
164
165                return multipartFieldsByName;
166        }
167
168        // The code below is sourced from Selenium.
169        // It is licensed under the terms of the Apache License, Version 2.0.
170        // The license text for all of the below code is as follows:
171
172        /*
173                Copyright 2012 Selenium committers
174                Copyright 2012 Software Freedom Conservancy
175
176                Licensed under the Apache License, Version 2.0 (the "License");
177                you may not use this file except in compliance with the License.
178                You may obtain a copy of the License at
179
180                 http://www.apache.org/licenses/LICENSE-2.0
181
182                Unless required by applicable law or agreed to in writing, software
183                distributed under the License is distributed on an "AS IS" BASIS,
184                WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
185                See the License for the specific language governing permissions and
186                limitations under the License.
187        */
188
189        // *** START Selenium UploadFileHandler source ***
190
191        protected LinkedCaseInsensitiveMap<String> splitHeaders(String readHeaders) {
192                LinkedCaseInsensitiveMap<String> headersBuilder = new LinkedCaseInsensitiveMap<>();
193                String[] headers = readHeaders.split("\r\n");
194                for (String headerLine : headers) {
195                        int index = headerLine.indexOf(':');
196                        if (index < 0) {
197                                continue;
198                        }
199                        String key = headerLine.substring(0, index);
200                        String value = headerLine.substring(index + 1).trim();
201                        headersBuilder.put(key, value);
202                }
203                return headersBuilder;
204        }
205
206        protected LinkedCaseInsensitiveMap<String> extractFields(String contentTypeHeader) {
207                LinkedCaseInsensitiveMap<String> fieldsBuilder = new LinkedCaseInsensitiveMap<>();
208                String[] contentTypeHeaderParts = contentTypeHeader.split("[;,]");
209                for (String contentTypeHeaderPart : contentTypeHeaderParts) {
210                        String[] kv = contentTypeHeaderPart.split("=");
211                        if (kv.length == 2) {
212                                fieldsBuilder.put(kv[0].trim().toLowerCase(Locale.US), kv[1].trim());
213                        }
214                }
215                return fieldsBuilder;
216        }
217
218        // *** END Selenium UploadFileHandler source ***
219
220        // The code below is sourced from the Apache Tomcat fork of Apache commons-fileupload.
221        // See https://github.com/apache/tomcat for the original.
222        // It is licensed under the terms of the Apache License, Version 2.0.
223        // The license text for all of the below code is as follows:
224
225        /*
226         * Licensed to the Apache Software Foundation (ASF) under one or more
227         * contributor license agreements.  See the NOTICE file distributed with
228         * this work for additional information regarding copyright ownership.
229         * The ASF licenses this file to You under the Apache License, Version 2.0
230         * (the "License"); you may not use this file except in compliance with
231         * the License.  You may obtain a copy of the License at
232         *
233         *      http://www.apache.org/licenses/LICENSE-2.0
234         *
235         * Unless required by applicable law or agreed to in writing, software
236         * distributed under the License is distributed on an "AS IS" BASIS,
237         * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
238         * See the License for the specific language governing permissions and
239         * limitations under the License.
240         */
241
242        // *** START commons-fileupload source ***
243
244        /**
245         * Receives progress information. May be used to display a progress bar.
246         */
247        @FunctionalInterface
248        protected interface ProgressListener {
249
250                /**
251                 * Nop implementation.
252                 */
253                ProgressListener NOP = (bytesRead, contentLength, items) -> {
254                        // nop
255                };
256
257                /**
258                 * Updates the listeners status information.
259                 *
260                 * @param bytesRead     The total number of bytes, which have been read so far.
261                 * @param contentLength The total number of bytes, which are being read. May be -1, if this number is unknown.
262                 * @param items         The number of the field, which is currently being read. (0 = no item so far, 1 = first item is being read, ...)
263                 */
264                void update(long bytesRead, long contentLength, int items);
265
266        }
267
268        /**
269         * Exception for errors encountered while processing the request.
270         */
271        protected static class FileUploadException extends IOException {
272
273                private static final long serialVersionUID = -4222909057964038517L;
274
275                /**
276                 * Constructs a new {@code FileUploadException} without message.
277                 */
278                public FileUploadException() {
279                        super();
280                }
281
282                /**
283                 * Constructs a new {@code FileUploadException} with specified detail
284                 * message.
285                 *
286                 * @param msg the error message.
287                 */
288                public FileUploadException(final String msg) {
289                        super(msg);
290                }
291
292                /**
293                 * Creates a new {@code FileUploadException} with the given
294                 * detail message and cause.
295                 *
296                 * @param msg   The exceptions detail message.
297                 * @param cause The exceptions cause.
298                 */
299                public FileUploadException(final String msg, final Throwable cause) {
300                        super(msg, cause);
301                }
302        }
303
304        /**
305         * This exception is thrown for hiding an inner
306         * {@link FileUploadException} in an {@link IOException}.
307         */
308        protected static class FileUploadIOException extends IOException {
309
310                /**
311                 * The exceptions UID, for serializing an instance.
312                 */
313                private static final long serialVersionUID = -7047616958165584154L;
314
315                /**
316                 * The exceptions cause; we overwrite the parent
317                 * classes field, which is available since Java
318                 * 1.4 only.
319                 */
320                private final FileUploadException cause;
321
322                /**
323                 * Creates a {@code FileUploadIOException} with the
324                 * given cause.
325                 *
326                 * @param pCause The exceptions cause, if any, or null.
327                 */
328                public FileUploadIOException(final FileUploadException pCause) {
329                        // We're not doing super(pCause) cause of 1.3 compatibility.
330                        cause = pCause;
331                }
332
333                /**
334                 * Returns the exceptions cause.
335                 *
336                 * @return The exceptions cause, if any, or null.
337                 */
338                @SuppressWarnings("sync-override") // Field is final
339                @Override
340                public Throwable getCause() {
341                        return cause;
342                }
343
344        }
345
346        /**
347         * <p> This class provides support for accessing the headers for a file or form
348         * item that was received within a {@code multipart/form-data} POST
349         * request.</p>
350         *
351         * @since 1.2.1
352         */
353        protected interface FileItemHeaders {
354
355                /**
356                 * Returns the value of the specified part header as a {@code String}.
357                 * <p>
358                 * If the part did not include a header of the specified name, this method
359                 * return {@code null}.  If there are multiple headers with the same
360                 * name, this method returns the first header in the item.  The header
361                 * name is case insensitive.
362                 *
363                 * @param name a {@code String} specifying the header name
364                 * @return a {@code String} containing the value of the requested
365                 * header, or {@code null} if the item does not have a header
366                 * of that name
367                 */
368                String getHeader(String name);
369
370                /**
371                 * <p>
372                 * Returns all the values of the specified item header as an
373                 * {@code Iterator} of {@code String} objects.
374                 * </p>
375                 * <p>
376                 * If the item did not include any headers of the specified name, this
377                 * method returns an empty {@code Iterator}. The header name is
378                 * case insensitive.
379                 * </p>
380                 *
381                 * @param name a {@code String} specifying the header name
382                 * @return an {@code Iterator} containing the values of the
383                 * requested header. If the item does not have any headers of
384                 * that name, return an empty {@code Iterator}
385                 */
386                Iterator<String> getHeaders(String name);
387
388                /**
389                 * <p>
390                 * Returns an {@code Iterator} of all the header names.
391                 * </p>
392                 *
393                 * @return an {@code Iterator} containing all of the names of
394                 * headers provided with this file item. If the item does not have
395                 * any headers return an empty {@code Iterator}
396                 */
397                Iterator<String> getHeaderNames();
398
399        }
400
401        /**
402         * Interface that will indicate that FileItem or FileItemStream
403         * implementations will accept the headers read for the item.
404         *
405         * @see FileItemStream
406         * @since 1.2.1
407         */
408        protected interface FileItemHeadersSupport {
409
410                /**
411                 * Returns the collection of headers defined locally within this item.
412                 *
413                 * @return the {@link FileItemHeaders} present for this item.
414                 */
415                FileItemHeaders getHeaders();
416
417                /**
418                 * Sets the headers read from within an item.  Implementations of
419                 * FileItem or FileItemStream should implement this
420                 * interface to be able to get the raw headers found within the item
421                 * header block.
422                 *
423                 * @param headers the instance that holds onto the headers
424                 *                for this instance.
425                 */
426                void setHeaders(FileItemHeaders headers);
427
428        }
429
430        /**
431         * <p> This interface provides access to a file or form item that was
432         * received within a {@code multipart/form-data} POST request.
433         * The items contents are retrieved by calling {@link #openStream()}.</p>
434         * <p>Instances of this class are created by accessing the
435         * iterator, returned by
436         * FileUploadBase#getItemIterator(RequestContext).</p>
437         * <p><em>Note</em>: There is an interaction between the iterator and
438         * its associated instances of {@link FileItemStream}: By invoking
439         * {@link java.util.Iterator#hasNext()} on the iterator, you discard all data,
440         * which hasn't been read so far from the previous data.</p>
441         */
442        protected interface FileItemStream extends FileItemHeadersSupport {
443
444                /**
445                 * This exception is thrown, if an attempt is made to read
446                 * data from the {@link InputStream}, which has been returned
447                 * by {@link FileItemStream#openStream()}, after
448                 * {@link java.util.Iterator#hasNext()} has been invoked on the
449                 * iterator, which created the {@link FileItemStream}.
450                 */
451                class ItemSkippedException extends IOException {
452
453                        /**
454                         * The exceptions serial version UID, which is being used
455                         * when serializing an exception instance.
456                         */
457                        private static final long serialVersionUID = -7280778431581963740L;
458
459                }
460
461                /**
462                 * Creates an {@link InputStream}, which allows to read the
463                 * items contents.
464                 *
465                 * @return The input stream, from which the items data may
466                 * be read.
467                 * @throws IllegalStateException The method was already invoked on
468                 *                               this item. It is not possible to recreate the data stream.
469                 * @throws IOException           An I/O error occurred.
470                 * @see ItemSkippedException
471                 */
472                InputStream openStream() throws IOException;
473
474                /**
475                 * Returns the content type passed by the browser or {@code null} if
476                 * not defined.
477                 *
478                 * @return The content type passed by the browser or {@code null} if
479                 * not defined.
480                 */
481                String getContentType();
482
483                /**
484                 * Returns the original file name in the client's file system, as provided by
485                 * the browser (or other client software). In most cases, this will be the
486                 * base file name, without path information. However, some clients, such as
487                 * the Opera browser, do include path information.
488                 *
489                 * @return The original file name in the client's file system.
490                 */
491                String getName();
492
493                /**
494                 * Returns the name of the field in the multipart form corresponding to
495                 * this file item.
496                 *
497                 * @return The name of the form field.
498                 */
499                String getFieldName();
500
501                /**
502                 * Determines whether or not a {@code FileItem} instance represents
503                 * a simple form field.
504                 *
505                 * @return {@code true} if the instance represents a simple form
506                 * field; {@code false} if it represents an uploaded file.
507                 */
508                boolean isFormField();
509
510        }
511
512        /**
513         * This exception is thrown in case of an invalid file name.
514         * A file name is invalid, if it contains a NUL character.
515         * Attackers might use this to circumvent security checks:
516         * For example, a malicious user might upload a file with the name
517         * "foo.exe\0.png". This file name might pass security checks (i.e.
518         * checks for the extension ".png"), while, depending on the underlying
519         * C library, it might create a file named "foo.exe", as the NUL
520         * character is the string terminator in C.
521         */
522        protected static class InvalidFileNameException extends RuntimeException {
523
524                /**
525                 * Serial version UID, being used, if the exception
526                 * is serialized.
527                 */
528                private static final long serialVersionUID = 7922042602454350470L;
529
530                /**
531                 * The file name causing the exception.
532                 */
533                private final String name;
534
535                /**
536                 * Creates a new instance.
537                 *
538                 * @param pName    The file name causing the exception.
539                 * @param pMessage A human readable error message.
540                 */
541                public InvalidFileNameException(final String pName, final String pMessage) {
542                        super(pMessage);
543                        name = pName;
544                }
545
546                /**
547                 * Returns the invalid file name.
548                 *
549                 * @return the invalid file name.
550                 */
551                public String getName() {
552                        return name;
553                }
554
555        }
556
557        /**
558         * Utility class for working with streams.
559         */
560        protected static final class Streams {
561
562                /**
563                 * Private constructor, to prevent instantiation.
564                 * This class has only static methods.
565                 */
566                private Streams() {
567                        // Does nothing
568                }
569
570                /**
571                 * Default buffer size for use in
572                 * {@link #copy(InputStream, OutputStream, boolean)}.
573                 */
574                public static final int DEFAULT_BUFFER_SIZE = 8192;
575
576                /**
577                 * Copies the contents of the given {@link InputStream}
578                 * to the given {@link OutputStream}. Shortcut for
579                 * <pre>
580                 *   copy(pInputStream, pOutputStream, new byte[8192]);
581                 * </pre>
582                 *
583                 * @param inputStream       The input stream, which is being read.
584                 *                          It is guaranteed, that {@link InputStream#close()} is called
585                 *                          on the stream.
586                 * @param outputStream      The output stream, to which data should
587                 *                          be written. May be null, in which case the input streams
588                 *                          contents are simply discarded.
589                 * @param closeOutputStream True guarantees, that {@link OutputStream#close()}
590                 *                          is called on the stream. False indicates, that only
591                 *                          {@link OutputStream#flush()} should be called finally.
592                 * @return Number of bytes, which have been copied.
593                 * @throws IOException An I/O error occurred.
594                 */
595                public static long copy(final InputStream inputStream, final OutputStream outputStream,
596                                                                                                                final boolean closeOutputStream)
597                                throws IOException {
598                        return copy(inputStream, outputStream, closeOutputStream, new byte[DEFAULT_BUFFER_SIZE]);
599                }
600
601                /**
602                 * Copies the contents of the given {@link InputStream}
603                 * to the given {@link OutputStream}.
604                 *
605                 * @param inputStream       The input stream, which is being read.
606                 *                          It is guaranteed, that {@link InputStream#close()} is called
607                 *                          on the stream.
608                 * @param outputStream      The output stream, to which data should
609                 *                          be written. May be null, in which case the input streams
610                 *                          contents are simply discarded.
611                 * @param closeOutputStream True guarantees, that {@link OutputStream#close()}
612                 *                          is called on the stream. False indicates, that only
613                 *                          {@link OutputStream#flush()} should be called finally.
614                 * @param buffer            Temporary buffer, which is to be used for
615                 *                          copying data.
616                 * @return Number of bytes, which have been copied.
617                 * @throws IOException An I/O error occurred.
618                 */
619                public static long copy(final InputStream inputStream,
620                                                                                                                final OutputStream outputStream, final boolean closeOutputStream,
621                                                                                                                final byte[] buffer)
622                                throws IOException {
623                        try (OutputStream out = outputStream;
624                                         InputStream in = inputStream) {
625                                long total = 0;
626                                for (; ; ) {
627                                        final int res = in.read(buffer);
628                                        if (res == -1) {
629                                                break;
630                                        }
631                                        if (res > 0) {
632                                                total += res;
633                                                if (out != null) {
634                                                        out.write(buffer, 0, res);
635                                                }
636                                        }
637                                }
638                                if (out != null) {
639                                        if (closeOutputStream) {
640                                                out.close();
641                                        } else {
642                                                out.flush();
643                                        }
644                                }
645                                in.close();
646                                return total;
647                        }
648                }
649
650                /**
651                 * Checks, whether the given file name is valid in the sense,
652                 * that it doesn't contain any NUL characters. If the file name
653                 * is valid, it will be returned without any modifications. Otherwise,
654                 * an {@link InvalidFileNameException} is raised.
655                 *
656                 * @param fileName The file name to check
657                 * @return Unmodified file name, if valid.
658                 * @throws InvalidFileNameException The file name was found to be invalid.
659                 */
660                public static String checkFileName(final String fileName) {
661                        if (fileName != null && fileName.indexOf('\u0000') != -1) {
662                                // pFileName.replace("\u0000", "\\0")
663                                final StringBuilder sb = new StringBuilder();
664                                for (int i = 0; i < fileName.length(); i++) {
665                                        final char c = fileName.charAt(i);
666                                        switch (c) {
667                                                case 0:
668                                                        sb.append("\\0");
669                                                        break;
670                                                default:
671                                                        sb.append(c);
672                                                        break;
673                                        }
674                                }
675                                throw new InvalidFileNameException(fileName,
676                                                "Invalid file name: " + sb);
677                        }
678                        return fileName;
679                }
680
681        }
682
683        /**
684         * <p> Low level API for processing file uploads.
685         *
686         * <p> This class can be used to process data streams conforming to MIME
687         * 'multipart' format as defined in
688         * <a href="http://www.ietf.org/rfc/rfc1867.txt">RFC 1867</a>. Arbitrarily
689         * large amounts of data in the stream can be processed under constant
690         * memory usage.
691         *
692         * <p> The format of the stream is defined in the following way:<br>
693         *
694         * <code>
695         * multipart-body := preamble 1*encapsulation close-delimiter epilogue<br>
696         * encapsulation := delimiter body CRLF<br>
697         * delimiter := "--" boundary CRLF<br>
698         * close-delimiter := "--" boundary "--"<br>
699         * preamble := &lt;ignore&gt;<br>
700         * epilogue := &lt;ignore&gt;<br>
701         * body := header-part CRLF body-data<br>
702         * header-part := 1*header CRLF<br>
703         * header := header-name ":" header-value<br>
704         * header-name := &lt;printable ascii characters except ":"&gt;<br>
705         * header-value := &lt;any ascii characters except CR &amp; LF&gt;<br>
706         * body-data := &lt;arbitrary data&gt;<br>
707         * </code>
708         *
709         * <p>Note that body-data can contain another multipart entity.  There
710         * is limited support for single pass processing of such nested
711         * streams.  The nested stream is <strong>required</strong> to have a
712         * boundary token of the same length as the parent stream (see {@link
713         * #setBoundary(byte[])}).
714         *
715         * <p>Here is an example of usage of this class.<br>
716         *
717         * <pre>
718         *   try {
719         *     MultipartStream multipartStream = new MultipartStream(input, boundary);
720         *     boolean nextPart = multipartStream.skipPreamble();
721         *     OutputStream output;
722         *     while(nextPart) {
723         *       String header = multipartStream.readHeaders();
724         *       // process headers
725         *       // create some output stream
726         *       multipartStream.readBodyData(output);
727         *       nextPart = multipartStream.readBoundary();
728         *     }
729         *   } catch(MultipartStream.MalformedStreamException e) {
730         *     // the stream failed to follow required syntax
731         *   } catch(IOException e) {
732         *     // a read or write error occurred
733         *   }
734         * </pre>
735         */
736        protected static class MultipartStream {
737                /**
738                 * Internal class, which is used to invoke the
739                 * {@link ProgressListener}.
740                 */
741                public static class ProgressNotifier {
742
743                        /**
744                         * The listener to invoke.
745                         */
746                        private final ProgressListener listener;
747
748                        /**
749                         * Number of expected bytes, if known, or -1.
750                         */
751                        private final long contentLength;
752
753                        /**
754                         * Number of bytes, which have been read so far.
755                         */
756                        private long bytesRead;
757
758                        /**
759                         * Number of items, which have been read so far.
760                         */
761                        private int items;
762
763                        /**
764                         * Creates a new instance with the given listener
765                         * and content length.
766                         *
767                         * @param pListener      The listener to invoke.
768                         * @param pContentLength The expected content length.
769                         */
770                        public ProgressNotifier(final ProgressListener pListener, final long pContentLength) {
771                                listener = pListener;
772                                contentLength = pContentLength;
773                        }
774
775                        /**
776                         * Called to indicate that bytes have been read.
777                         *
778                         * @param pBytes Number of bytes, which have been read.
779                         */
780                        void noteBytesRead(final int pBytes) {
781                                /* Indicates, that the given number of bytes have been read from
782                                 * the input stream.
783                                 */
784                                bytesRead += pBytes;
785                                notifyListener();
786                        }
787
788                        /**
789                         * Called to indicate, that a new file item has been detected.
790                         */
791                        public void noteItem() {
792                                ++items;
793                                notifyListener();
794                        }
795
796                        /**
797                         * Called for notifying the listener.
798                         */
799                        private void notifyListener() {
800                                if (listener != null) {
801                                        listener.update(bytesRead, contentLength, items);
802                                }
803                        }
804
805                }
806
807                // ----------------------------------------------------- Manifest constants
808
809                /**
810                 * The Carriage Return ASCII character value.
811                 */
812                public static final byte CR = 0x0D;
813
814                /**
815                 * The Line Feed ASCII character value.
816                 */
817                public static final byte LF = 0x0A;
818
819                /**
820                 * The dash (-) ASCII character value.
821                 */
822                public static final byte DASH = 0x2D;
823
824                /**
825                 * The maximum length of {@code header-part} that will be
826                 * processed (10 kilobytes = 10240 bytes.).
827                 */
828                public static final int HEADER_PART_SIZE_MAX = 10240;
829
830                /**
831                 * The default length of the buffer used for processing a request.
832                 */
833                protected static final int DEFAULT_BUFSIZE = 4096;
834
835                /**
836                 * A byte sequence that marks the end of {@code header-part}
837                 * ({@code CRLFCRLF}).
838                 */
839                protected static final byte[] HEADER_SEPARATOR = {CR, LF, CR, LF};
840
841                /**
842                 * A byte sequence that follows a delimiter that will be
843                 * followed by an encapsulation ({@code CRLF}).
844                 */
845                protected static final byte[] FIELD_SEPARATOR = {CR, LF};
846
847                /**
848                 * A byte sequence that follows a delimiter of the last
849                 * encapsulation in the stream ({@code --}).
850                 */
851                protected static final byte[] STREAM_TERMINATOR = {DASH, DASH};
852
853                /**
854                 * A byte sequence that precedes a boundary ({@code CRLF--}).
855                 */
856                protected static final byte[] BOUNDARY_PREFIX = {CR, LF, DASH, DASH};
857
858                // ----------------------------------------------------------- Data members
859
860                /**
861                 * The input stream from which data is read.
862                 */
863                private final InputStream input;
864
865                /**
866                 * The length of the boundary token plus the leading {@code CRLF--}.
867                 */
868                private int boundaryLength;
869
870                /**
871                 * The amount of data, in bytes, that must be kept in the buffer in order
872                 * to detect delimiters reliably.
873                 */
874                private final int keepRegion;
875
876                /**
877                 * The byte sequence that partitions the stream.
878                 */
879                private final byte[] boundary;
880
881                /**
882                 * The table for Knuth-Morris-Pratt search algorithm.
883                 */
884                private final int[] boundaryTable;
885
886                /**
887                 * The length of the buffer used for processing the request.
888                 */
889                private final int bufSize;
890
891                /**
892                 * The buffer used for processing the request.
893                 */
894                private final byte[] buffer;
895
896                /**
897                 * The index of first valid character in the buffer.
898                 * <br>
899                 * 0 <= head < bufSize
900                 */
901                private int head;
902
903                /**
904                 * The index of last valid character in the buffer + 1.
905                 * <br>
906                 * 0 <= tail <= bufSize
907                 */
908                private int tail;
909
910                /**
911                 * The content encoding to use when reading headers.
912                 */
913                private String headerEncoding;
914
915                /**
916                 * The progress notifier, if any, or null.
917                 */
918                private final ProgressNotifier notifier;
919
920                // ----------------------------------------------------------- Constructors
921
922                /**
923                 * <p> Constructs a {@code MultipartStream} with a custom size buffer.
924                 *
925                 * <p> Note that the buffer must be at least big enough to contain the
926                 * boundary string, plus 4 characters for CR/LF and double dash, plus at
927                 * least one byte of data.  Too small a buffer size setting will degrade
928                 * performance.
929                 *
930                 * @param input     The {@code InputStream} to serve as a data source.
931                 * @param boundary  The token used for dividing the stream into
932                 *                  {@code encapsulations}.
933                 * @param bufSize   The size of the buffer to be used, in bytes.
934                 * @param pNotifier The notifier, which is used for calling the
935                 *                  progress listener, if any.
936                 * @throws IllegalArgumentException If the buffer size is too small
937                 * @since 1.3.1
938                 */
939                public MultipartStream(final InputStream input,
940                                                                                                         final byte[] boundary,
941                                                                                                         final int bufSize,
942                                                                                                         final ProgressNotifier pNotifier) {
943
944                        if (boundary == null) {
945                                throw new IllegalArgumentException("boundary may not be null");
946                        }
947                        // We prepend CR/LF to the boundary to chop trailing CR/LF from
948                        // body-data tokens.
949                        this.boundaryLength = boundary.length + BOUNDARY_PREFIX.length;
950                        if (bufSize < this.boundaryLength + 1) {
951                                throw new IllegalArgumentException(
952                                                "The buffer size specified for the MultipartStream is too small");
953                        }
954
955                        this.input = input;
956                        this.bufSize = Math.max(bufSize, boundaryLength * 2);
957                        this.buffer = new byte[this.bufSize];
958                        this.notifier = pNotifier;
959
960                        this.boundary = new byte[this.boundaryLength];
961                        this.boundaryTable = new int[this.boundaryLength + 1];
962                        this.keepRegion = this.boundary.length;
963
964                        System.arraycopy(BOUNDARY_PREFIX, 0, this.boundary, 0,
965                                        BOUNDARY_PREFIX.length);
966                        System.arraycopy(boundary, 0, this.boundary, BOUNDARY_PREFIX.length,
967                                        boundary.length);
968                        computeBoundaryTable();
969
970                        head = 0;
971                        tail = 0;
972                }
973
974                /**
975                 * <p> Constructs a {@code MultipartStream} with a default size buffer.
976                 *
977                 * @param input     The {@code InputStream} to serve as a data source.
978                 * @param boundary  The token used for dividing the stream into
979                 *                  {@code encapsulations}.
980                 * @param pNotifier An object for calling the progress listener, if any.
981                 * @see #MultipartStream(InputStream, byte[], int, ProgressNotifier)
982                 */
983                public MultipartStream(final InputStream input,
984                                                                                                         final byte[] boundary,
985                                                                                                         final ProgressNotifier pNotifier) {
986                        this(input, boundary, DEFAULT_BUFSIZE, pNotifier);
987                }
988
989                // --------------------------------------------------------- Public methods
990
991                /**
992                 * Retrieves the character encoding used when reading the headers of an
993                 * individual part. When not specified, or {@code null}, the platform
994                 * default encoding is used.
995                 *
996                 * @return The encoding used to read part headers.
997                 */
998                public String getHeaderEncoding() {
999                        return headerEncoding;
1000                }
1001
1002                /**
1003                 * Specifies the character encoding to be used when reading the headers of
1004                 * individual parts. When not specified, or {@code null}, the platform
1005                 * default encoding is used.
1006                 *
1007                 * @param encoding The encoding used to read part headers.
1008                 */
1009                public void setHeaderEncoding(final String encoding) {
1010                        headerEncoding = encoding;
1011                }
1012
1013                /**
1014                 * Reads a byte from the {@code buffer}, and refills it as
1015                 * necessary.
1016                 *
1017                 * @return The next byte from the input stream.
1018                 * @throws IOException if there is no more data available.
1019                 */
1020                public byte readByte() throws IOException {
1021                        // Buffer depleted ?
1022                        if (head == tail) {
1023                                head = 0;
1024                                // Refill.
1025                                tail = input.read(buffer, head, bufSize);
1026                                if (tail == -1) {
1027                                        // No more data available.
1028                                        throw new IOException("No more data is available");
1029                                }
1030                                if (notifier != null) {
1031                                        notifier.noteBytesRead(tail);
1032                                }
1033                        }
1034                        return buffer[head++];
1035                }
1036
1037                /**
1038                 * Skips a {@code boundary} token, and checks whether more
1039                 * {@code encapsulations} are contained in the stream.
1040                 *
1041                 * @return {@code true} if there are more encapsulations in
1042                 * this stream; {@code false} otherwise.
1043                 * @throws FileUploadIOException    if the bytes read from the stream exceeded the size limits
1044                 * @throws MalformedStreamException if the stream ends unexpectedly or
1045                 *                                  fails to follow required syntax.
1046                 */
1047                public boolean readBoundary()
1048                                throws FileUploadIOException, MalformedStreamException {
1049                        final byte[] marker = new byte[2];
1050                        final boolean nextChunk;
1051
1052                        head += boundaryLength;
1053                        try {
1054                                marker[0] = readByte();
1055                                if (marker[0] == LF) {
1056                                        // Work around IE5 Mac bug with input type=image.
1057                                        // Because the boundary delimiter, not including the trailing
1058                                        // CRLF, must not appear within any file (RFC 2046, section
1059                                        // 5.1.1), we know the missing CR is due to a buggy browser
1060                                        // rather than a file containing something similar to a
1061                                        // boundary.
1062                                        return true;
1063                                }
1064
1065                                marker[1] = readByte();
1066                                if (arrayequals(marker, STREAM_TERMINATOR, 2)) {
1067                                        nextChunk = false;
1068                                } else if (arrayequals(marker, FIELD_SEPARATOR, 2)) {
1069                                        nextChunk = true;
1070                                } else {
1071                                        throw new MalformedStreamException(
1072                                                        "Unexpected characters follow a boundary");
1073                                }
1074                        } catch (final FileUploadIOException e) {
1075                                // wraps a SizeException, re-throw as it will be unwrapped later
1076                                throw e;
1077                        } catch (final IOException e) {
1078                                throw new MalformedStreamException("Stream ended unexpectedly");
1079                        }
1080                        return nextChunk;
1081                }
1082
1083                /**
1084                 * <p>Changes the boundary token used for partitioning the stream.
1085                 *
1086                 * <p>This method allows single pass processing of nested multipart
1087                 * streams.
1088                 *
1089                 * <p>The boundary token of the nested stream is {@code required}
1090                 * to be of the same length as the boundary token in parent stream.
1091                 *
1092                 * <p>Restoring the parent stream boundary token after processing of a
1093                 * nested stream is left to the application.
1094                 *
1095                 * @param boundary The boundary to be used for parsing of the nested
1096                 *                 stream.
1097                 * @throws IllegalBoundaryException if the {@code boundary}
1098                 *                                  has a different length than the one
1099                 *                                  being currently parsed.
1100                 */
1101                public void setBoundary(final byte[] boundary)
1102                                throws IllegalBoundaryException {
1103                        if (boundary.length != boundaryLength - BOUNDARY_PREFIX.length) {
1104                                throw new IllegalBoundaryException(
1105                                                "The length of a boundary token cannot be changed");
1106                        }
1107                        System.arraycopy(boundary, 0, this.boundary, BOUNDARY_PREFIX.length,
1108                                        boundary.length);
1109                        computeBoundaryTable();
1110                }
1111
1112                /**
1113                 * Compute the table used for Knuth-Morris-Pratt search algorithm.
1114                 */
1115                private void computeBoundaryTable() {
1116                        int position = 2;
1117                        int candidate = 0;
1118
1119                        boundaryTable[0] = -1;
1120                        boundaryTable[1] = 0;
1121
1122                        while (position <= boundaryLength) {
1123                                if (boundary[position - 1] == boundary[candidate]) {
1124                                        boundaryTable[position] = candidate + 1;
1125                                        candidate++;
1126                                        position++;
1127                                } else if (candidate > 0) {
1128                                        candidate = boundaryTable[candidate];
1129                                } else {
1130                                        boundaryTable[position] = 0;
1131                                        position++;
1132                                }
1133                        }
1134                }
1135
1136                /**
1137                 * <p>Reads the {@code header-part} of the current
1138                 * {@code encapsulation}.
1139                 *
1140                 * <p>Headers are returned verbatim to the input stream, including the
1141                 * trailing {@code CRLF} marker. Parsing is left to the
1142                 * application.
1143                 *
1144                 * <p><strong>TODO</strong> allow limiting maximum header size to
1145                 * protect against abuse.
1146                 *
1147                 * @return The {@code header-part} of the current encapsulation.
1148                 * @throws FileUploadIOException    if the bytes read from the stream exceeded the size limits.
1149                 * @throws MalformedStreamException if the stream ends unexpectedly.
1150                 */
1151                public String readHeaders() throws FileUploadIOException, MalformedStreamException {
1152                        int i = 0;
1153                        byte b;
1154                        // to support multi-byte characters
1155                        final ByteArrayOutputStream baos = new ByteArrayOutputStream();
1156                        int size = 0;
1157                        while (i < HEADER_SEPARATOR.length) {
1158                                try {
1159                                        b = readByte();
1160                                } catch (final FileUploadIOException e) {
1161                                        // wraps a SizeException, re-throw as it will be unwrapped later
1162                                        throw e;
1163                                } catch (final IOException e) {
1164                                        throw new MalformedStreamException("Stream ended unexpectedly");
1165                                }
1166                                if (++size > HEADER_PART_SIZE_MAX) {
1167                                        throw new MalformedStreamException(String.format(
1168                                                        "Header section has more than %s bytes (maybe it is not properly terminated)",
1169                                                        Integer.valueOf(HEADER_PART_SIZE_MAX)));
1170                                }
1171                                if (b == HEADER_SEPARATOR[i]) {
1172                                        i++;
1173                                } else {
1174                                        i = 0;
1175                                }
1176                                baos.write(b);
1177                        }
1178
1179                        String headers;
1180                        if (headerEncoding != null) {
1181                                try {
1182                                        headers = baos.toString(headerEncoding);
1183                                } catch (final UnsupportedEncodingException e) {
1184                                        // Fall back to platform default if specified encoding is not
1185                                        // supported.
1186                                        headers = baos.toString();
1187                                }
1188                        } else {
1189                                headers = baos.toString();
1190                        }
1191
1192                        return headers;
1193                }
1194
1195                /**
1196                 * <p>Reads {@code body-data} from the current
1197                 * {@code encapsulation} and writes its contents into the
1198                 * output {@code Stream}.
1199                 *
1200                 * <p>Arbitrary large amounts of data can be processed by this
1201                 * method using a constant size buffer. (see {@link
1202                 * #MultipartStream(InputStream, byte[], int,
1203                 * MultipartStream.ProgressNotifier) constructor}).
1204                 *
1205                 * @param output The {@code Stream} to write data into. May
1206                 *               be null, in which case this method is equivalent
1207                 *               to {@link #discardBodyData()}.
1208                 * @return the amount of data written.
1209                 * @throws MalformedStreamException if the stream ends unexpectedly.
1210                 * @throws IOException              if an i/o error occurs.
1211                 */
1212                public int readBodyData(final OutputStream output)
1213                                throws MalformedStreamException, IOException {
1214                        return (int) Streams.copy(newInputStream(), output, false); // N.B. Streams.copy closes the input stream
1215                }
1216
1217                /**
1218                 * Creates a new {@link ItemInputStream}.
1219                 *
1220                 * @return A new instance of {@link ItemInputStream}.
1221                 */
1222                public ItemInputStream newInputStream() {
1223                        return new ItemInputStream();
1224                }
1225
1226                /**
1227                 * <p> Reads {@code body-data} from the current
1228                 * {@code encapsulation} and discards it.
1229                 *
1230                 * <p>Use this method to skip encapsulations you don't need or don't
1231                 * understand.
1232                 *
1233                 * @return The amount of data discarded.
1234                 * @throws MalformedStreamException if the stream ends unexpectedly.
1235                 * @throws IOException              if an i/o error occurs.
1236                 */
1237                public int discardBodyData() throws MalformedStreamException, IOException {
1238                        return readBodyData(null);
1239                }
1240
1241                /**
1242                 * Finds the beginning of the first {@code encapsulation}.
1243                 *
1244                 * @return {@code true} if an {@code encapsulation} was found in
1245                 * the stream.
1246                 * @throws IOException if an i/o error occurs.
1247                 */
1248                public boolean skipPreamble() throws IOException {
1249                        // First delimiter may be not preceded with a CRLF.
1250                        System.arraycopy(boundary, 2, boundary, 0, boundary.length - 2);
1251                        boundaryLength = boundary.length - 2;
1252                        computeBoundaryTable();
1253                        try {
1254                                // Discard all data up to the delimiter.
1255                                discardBodyData();
1256
1257                                // Read boundary - if succeeded, the stream contains an
1258                                // encapsulation.
1259                                return readBoundary();
1260                        } catch (final MalformedStreamException e) {
1261                                return false;
1262                        } finally {
1263                                // Restore delimiter.
1264                                System.arraycopy(boundary, 0, boundary, 2, boundary.length - 2);
1265                                boundaryLength = boundary.length;
1266                                boundary[0] = CR;
1267                                boundary[1] = LF;
1268                                computeBoundaryTable();
1269                        }
1270                }
1271
1272                /**
1273                 * Compares {@code count} first bytes in the arrays
1274                 * {@code a} and {@code b}.
1275                 *
1276                 * @param a     The first array to compare.
1277                 * @param b     The second array to compare.
1278                 * @param count How many bytes should be compared.
1279                 * @return {@code true} if {@code count} first bytes in arrays
1280                 * {@code a} and {@code b} are equal.
1281                 */
1282                public static boolean arrayequals(final byte[] a,
1283                                                                                                                                                        final byte[] b,
1284                                                                                                                                                        final int count) {
1285                        for (int i = 0; i < count; i++) {
1286                                if (a[i] != b[i]) {
1287                                        return false;
1288                                }
1289                        }
1290                        return true;
1291                }
1292
1293                /**
1294                 * Searches for the {@code boundary} in the {@code buffer}
1295                 * region delimited by {@code head} and {@code tail}.
1296                 *
1297                 * @return The position of the boundary found, counting from the
1298                 * beginning of the {@code buffer}, or {@code -1} if
1299                 * not found.
1300                 */
1301                protected int findSeparator() {
1302
1303                        int bufferPos = this.head;
1304                        int tablePos = 0;
1305
1306                        while (bufferPos < this.tail) {
1307                                while (tablePos >= 0 && buffer[bufferPos] != boundary[tablePos]) {
1308                                        tablePos = boundaryTable[tablePos];
1309                                }
1310                                bufferPos++;
1311                                tablePos++;
1312                                if (tablePos == boundaryLength) {
1313                                        return bufferPos - boundaryLength;
1314                                }
1315                        }
1316                        return -1;
1317                }
1318
1319                /**
1320                 * Thrown to indicate that the input stream fails to follow the
1321                 * required syntax.
1322                 */
1323                public static class MalformedStreamException extends IOException {
1324
1325                        /**
1326                         * The UID to use when serializing this instance.
1327                         */
1328                        private static final long serialVersionUID = 6466926458059796677L;
1329
1330                        /**
1331                         * Constructs a {@code MalformedStreamException} with no
1332                         * detail message.
1333                         */
1334                        public MalformedStreamException() {
1335                        }
1336
1337                        /**
1338                         * Constructs an {@code MalformedStreamException} with
1339                         * the specified detail message.
1340                         *
1341                         * @param message The detail message.
1342                         */
1343                        public MalformedStreamException(final String message) {
1344                                super(message);
1345                        }
1346
1347                }
1348
1349                /**
1350                 * Thrown upon attempt of setting an invalid boundary token.
1351                 */
1352                public static class IllegalBoundaryException extends IOException {
1353
1354                        /**
1355                         * The UID to use when serializing this instance.
1356                         */
1357                        private static final long serialVersionUID = -161533165102632918L;
1358
1359                        /**
1360                         * Constructs an {@code IllegalBoundaryException} with no
1361                         * detail message.
1362                         */
1363                        public IllegalBoundaryException() {
1364                        }
1365
1366                        /**
1367                         * Constructs an {@code IllegalBoundaryException} with
1368                         * the specified detail message.
1369                         *
1370                         * @param message The detail message.
1371                         */
1372                        public IllegalBoundaryException(final String message) {
1373                                super(message);
1374                        }
1375
1376                }
1377
1378                /**
1379                 * An {@link InputStream} for reading an items contents.
1380                 */
1381                public class ItemInputStream extends InputStream implements Closeable {
1382
1383                        /**
1384                         * The number of bytes, which have been read so far.
1385                         */
1386                        private long total;
1387
1388                        /**
1389                         * The number of bytes, which must be hold, because
1390                         * they might be a part of the boundary.
1391                         */
1392                        private int pad;
1393
1394                        /**
1395                         * The current offset in the buffer.
1396                         */
1397                        private int pos;
1398
1399                        /**
1400                         * Whether the stream is already closed.
1401                         */
1402                        private boolean closed;
1403
1404                        /**
1405                         * Creates a new instance.
1406                         */
1407                        ItemInputStream() {
1408                                findSeparator();
1409                        }
1410
1411                        /**
1412                         * Called for finding the separator.
1413                         */
1414                        private void findSeparator() {
1415                                pos = MultipartStream.this.findSeparator();
1416                                if (pos == -1) {
1417                                        if (tail - head > keepRegion) {
1418                                                pad = keepRegion;
1419                                        } else {
1420                                                pad = tail - head;
1421                                        }
1422                                }
1423                        }
1424
1425                        /**
1426                         * Returns the number of bytes, which have been read
1427                         * by the stream.
1428                         *
1429                         * @return Number of bytes, which have been read so far.
1430                         */
1431                        public long getBytesRead() {
1432                                return total;
1433                        }
1434
1435                        /**
1436                         * Returns the number of bytes, which are currently
1437                         * available, without blocking.
1438                         *
1439                         * @return Number of bytes in the buffer.
1440                         * @throws IOException An I/O error occurs.
1441                         */
1442                        @Override
1443                        public int available() throws IOException {
1444                                if (pos == -1) {
1445                                        return tail - head - pad;
1446                                }
1447                                return pos - head;
1448                        }
1449
1450                        /**
1451                         * Offset when converting negative bytes to integers.
1452                         */
1453                        private static final int BYTE_POSITIVE_OFFSET = 256;
1454
1455                        /**
1456                         * Returns the next byte in the stream.
1457                         *
1458                         * @return The next byte in the stream, as a non-negative
1459                         * integer, or -1 for EOF.
1460                         * @throws IOException An I/O error occurred.
1461                         */
1462                        @Override
1463                        public int read() throws IOException {
1464                                if (closed) {
1465                                        throw new FileItemStream.ItemSkippedException();
1466                                }
1467                                if (available() == 0 && makeAvailable() == 0) {
1468                                        return -1;
1469                                }
1470                                ++total;
1471                                final int b = buffer[head++];
1472                                if (b >= 0) {
1473                                        return b;
1474                                }
1475                                return b + BYTE_POSITIVE_OFFSET;
1476                        }
1477
1478                        /**
1479                         * Reads bytes into the given buffer.
1480                         *
1481                         * @param b   The destination buffer, where to write to.
1482                         * @param off Offset of the first byte in the buffer.
1483                         * @param len Maximum number of bytes to read.
1484                         * @return Number of bytes, which have been actually read,
1485                         * or -1 for EOF.
1486                         * @throws IOException An I/O error occurred.
1487                         */
1488                        @Override
1489                        public int read(final byte[] b, final int off, final int len) throws IOException {
1490                                if (closed) {
1491                                        throw new FileItemStream.ItemSkippedException();
1492                                }
1493                                if (len == 0) {
1494                                        return 0;
1495                                }
1496                                int res = available();
1497                                if (res == 0) {
1498                                        res = makeAvailable();
1499                                        if (res == 0) {
1500                                                return -1;
1501                                        }
1502                                }
1503                                res = Math.min(res, len);
1504                                System.arraycopy(buffer, head, b, off, res);
1505                                head += res;
1506                                total += res;
1507                                return res;
1508                        }
1509
1510                        /**
1511                         * Closes the input stream.
1512                         *
1513                         * @throws IOException An I/O error occurred.
1514                         */
1515                        @Override
1516                        public void close() throws IOException {
1517                                close(false);
1518                        }
1519
1520                        /**
1521                         * Closes the input stream.
1522                         *
1523                         * @param pCloseUnderlying Whether to close the underlying stream
1524                         *                         (hard close)
1525                         * @throws IOException An I/O error occurred.
1526                         */
1527                        public void close(final boolean pCloseUnderlying) throws IOException {
1528                                if (closed) {
1529                                        return;
1530                                }
1531                                if (pCloseUnderlying) {
1532                                        closed = true;
1533                                        input.close();
1534                                } else {
1535                                        for (; ; ) {
1536                                                int av = available();
1537                                                if (av == 0) {
1538                                                        av = makeAvailable();
1539                                                        if (av == 0) {
1540                                                                break;
1541                                                        }
1542                                                }
1543                                                skip(av);
1544                                        }
1545                                }
1546                                closed = true;
1547                        }
1548
1549                        /**
1550                         * Skips the given number of bytes.
1551                         *
1552                         * @param bytes Number of bytes to skip.
1553                         * @return The number of bytes, which have actually been
1554                         * skipped.
1555                         * @throws IOException An I/O error occurred.
1556                         */
1557                        @Override
1558                        public long skip(final long bytes) throws IOException {
1559                                if (closed) {
1560                                        throw new FileItemStream.ItemSkippedException();
1561                                }
1562                                int av = available();
1563                                if (av == 0) {
1564                                        av = makeAvailable();
1565                                        if (av == 0) {
1566                                                return 0;
1567                                        }
1568                                }
1569                                final long res = Math.min(av, bytes);
1570                                head += res;
1571                                return res;
1572                        }
1573
1574                        /**
1575                         * Attempts to read more data.
1576                         *
1577                         * @return Number of available bytes
1578                         * @throws IOException An I/O error occurred.
1579                         */
1580                        private int makeAvailable() throws IOException {
1581                                if (pos != -1) {
1582                                        return 0;
1583                                }
1584
1585                                // Move the data to the beginning of the buffer.
1586                                total += tail - head - pad;
1587                                System.arraycopy(buffer, tail - pad, buffer, 0, pad);
1588
1589                                // Refill buffer with new data.
1590                                head = 0;
1591                                tail = pad;
1592
1593                                for (; ; ) {
1594                                        final int bytesRead = input.read(buffer, tail, bufSize - tail);
1595                                        if (bytesRead == -1) {
1596                                                // The last pad amount is left in the buffer.
1597                                                // Boundary can't be in there so signal an error
1598                                                // condition.
1599                                                final String msg = "Stream ended unexpectedly";
1600                                                throw new MalformedStreamException(msg);
1601                                        }
1602                                        if (notifier != null) {
1603                                                notifier.noteBytesRead(bytesRead);
1604                                        }
1605                                        tail += bytesRead;
1606
1607                                        findSeparator();
1608                                        final int av = available();
1609
1610                                        if (av > 0 || pos != -1) {
1611                                                return av;
1612                                        }
1613                                }
1614                        }
1615
1616                        /**
1617                         * Returns, whether the stream is closed.
1618                         *
1619                         * @return True, if the stream is closed, otherwise false.
1620                         */
1621                        public boolean isClosed() {
1622                                return closed;
1623                        }
1624
1625                }
1626        }
1627
1628        /**
1629         * A simple parser intended to parse sequences of name/value pairs.
1630         * <p>
1631         * Parameter values are expected to be enclosed in quotes if they contain unsafe characters, such as '=' characters or separators. Parameter values are optional
1632         * and can be omitted.
1633         * </p>
1634         * <p>
1635         * {@code param1 = value; param2 = "anything goes; really"; param3}
1636         * </p>
1637         */
1638        protected class ParameterParser {
1639
1640                /**
1641                 * String to be parsed.
1642                 */
1643                private char[] chars = null;
1644
1645                /**
1646                 * Current position in the string.
1647                 */
1648                private int pos = 0;
1649
1650                /**
1651                 * Maximum position in the string.
1652                 */
1653                private int len = 0;
1654
1655                /**
1656                 * Start of a token.
1657                 */
1658                private int i1 = 0;
1659
1660                /**
1661                 * End of a token.
1662                 */
1663                private int i2 = 0;
1664
1665                /**
1666                 * Whether names stored in the map should be converted to lower case.
1667                 */
1668                private boolean lowerCaseNames = false;
1669
1670                /**
1671                 * Default ParameterParser constructor.
1672                 */
1673                public ParameterParser() {
1674                }
1675
1676                /**
1677                 * A helper method to process the parsed token. This method removes leading and trailing blanks as well as enclosing quotation marks, when necessary.
1678                 *
1679                 * @param quoted {@code true} if quotation marks are expected, {@code false} otherwise.
1680                 * @return the token
1681                 */
1682                private String getToken(final boolean quoted) {
1683                        // Trim leading white spaces
1684                        while (i1 < i2 && Character.isWhitespace(chars[i1])) {
1685                                i1++;
1686                        }
1687                        // Trim trailing white spaces
1688                        while (i2 > i1 && Character.isWhitespace(chars[i2 - 1])) {
1689                                i2--;
1690                        }
1691                        // Strip away quotation marks if necessary
1692                        if (quoted && i2 - i1 >= 2 && chars[i1] == '"' && chars[i2 - 1] == '"') {
1693                                i1++;
1694                                i2--;
1695                        }
1696                        String result = null;
1697                        if (i2 > i1) {
1698                                result = new String(chars, i1, i2 - i1);
1699                        }
1700                        return result;
1701                }
1702
1703                /**
1704                 * Tests if there any characters left to parse.
1705                 *
1706                 * @return {@code true} if there are unparsed characters, {@code false} otherwise.
1707                 */
1708                private boolean hasChar() {
1709                        return this.pos < this.len;
1710                }
1711
1712                /**
1713                 * Tests {@code true} if parameter names are to be converted to lower case when name/value pairs are parsed.
1714                 *
1715                 * @return {@code true} if parameter names are to be converted to lower case when name/value pairs are parsed. Otherwise returns {@code false}
1716                 */
1717                public boolean isLowerCaseNames() {
1718                        return this.lowerCaseNames;
1719                }
1720
1721                /**
1722                 * Tests if the given character is present in the array of characters.
1723                 *
1724                 * @param ch      the character to test for presence in the array of characters
1725                 * @param charray the array of characters to test against
1726                 * @return {@code true} if the character is present in the array of characters, {@code false} otherwise.
1727                 */
1728                private boolean isOneOf(final char ch, final char[] charray) {
1729                        var result = false;
1730                        for (final char element : charray) {
1731                                if (ch == element) {
1732                                        result = true;
1733                                        break;
1734                                }
1735                        }
1736                        return result;
1737                }
1738
1739                /**
1740                 * Parses a map of name/value pairs from the given array of characters. Names are expected to be unique.
1741                 *
1742                 * @param charArray the array of characters that contains a sequence of name/value pairs
1743                 * @param separator the name/value pairs separator
1744                 * @return a map of name/value pairs
1745                 */
1746                public Map<String, String> parse(final char[] charArray, final char separator) {
1747                        if (charArray == null) {
1748                                return new LinkedHashMap<>();
1749                        }
1750                        return parse(charArray, 0, charArray.length, separator);
1751                }
1752
1753                /**
1754                 * Parses a map of name/value pairs from the given array of characters. Names are expected to be unique.
1755                 *
1756                 * @param charArray the array of characters that contains a sequence of name/value pairs
1757                 * @param offset    - the initial offset.
1758                 * @param length    - the length.
1759                 * @param separator the name/value pairs separator
1760                 * @return a map of name/value pairs
1761                 */
1762                public Map<String, String> parse(final char[] charArray, final int offset, final int length, final char separator) {
1763
1764                        if (charArray == null) {
1765                                return new LinkedHashMap<>();
1766                        }
1767                        final var params = new LinkedHashMap<String, String>();
1768                        this.chars = charArray.clone();
1769                        this.pos = offset;
1770                        this.len = length;
1771
1772                        String paramName;
1773                        String paramValue;
1774                        while (hasChar()) {
1775                                paramName = parseToken(new char[]{'=', separator});
1776                                paramValue = null;
1777                                if (hasChar() && charArray[pos] == '=') {
1778                                        pos++; // skip '='
1779                                        paramValue = parseQuotedToken(new char[]{separator});
1780
1781                                        if (paramValue != null) {
1782                                                try {
1783                                                        paramValue = RFC2231Utils.hasEncodedValue(paramName) ? RFC2231Utils.decodeText(paramValue) : MimeUtils.decodeText(paramValue);
1784                                                } catch (final UnsupportedEncodingException ignored) {
1785                                                        // let's keep the original value in this case
1786                                                }
1787                                        }
1788                                }
1789                                if (hasChar() && charArray[pos] == separator) {
1790                                        pos++; // skip separator
1791                                }
1792                                if (paramName != null && !paramName.isEmpty()) {
1793                                        paramName = RFC2231Utils.stripDelimiter(paramName);
1794                                        if (this.lowerCaseNames) {
1795                                                paramName = paramName.toLowerCase(Locale.ENGLISH);
1796                                        }
1797                                        params.put(paramName, paramValue);
1798                                }
1799                        }
1800                        return params;
1801                }
1802
1803                /**
1804                 * Parses a map of name/value pairs from the given string. Names are expected to be unique.
1805                 *
1806                 * @param str       the string that contains a sequence of name/value pairs
1807                 * @param separator the name/value pairs separator
1808                 * @return a map of name/value pairs
1809                 */
1810                public Map<String, String> parse(final String str, final char separator) {
1811                        if (str == null) {
1812                                return new LinkedHashMap<>();
1813                        }
1814                        return parse(str.toCharArray(), separator);
1815                }
1816
1817                /**
1818                 * Parses a map of name/value pairs from the given string. Names are expected to be unique. Multiple separators may be specified and the earliest found in
1819                 * the input string is used.
1820                 *
1821                 * @param str        the string that contains a sequence of name/value pairs
1822                 * @param separators the name/value pairs separators
1823                 * @return a map of name/value pairs
1824                 */
1825                public Map<String, String> parse(final String str, final char[] separators) {
1826                        if (separators == null || separators.length == 0) {
1827                                return new LinkedHashMap<>();
1828                        }
1829                        var separator = separators[0];
1830                        if (str != null) {
1831                                var idx = str.length();
1832                                for (final char separator2 : separators) {
1833                                        final var tmp = str.indexOf(separator2);
1834                                        if (tmp != -1 && tmp < idx) {
1835                                                idx = tmp;
1836                                                separator = separator2;
1837                                        }
1838                                }
1839                        }
1840                        return parse(str, separator);
1841                }
1842
1843                /**
1844                 * Parses out a token until any of the given terminators is encountered outside the quotation marks.
1845                 *
1846                 * @param terminators the array of terminating characters. Any of these characters when encountered outside the quotation marks signify the end of the token
1847                 * @return the token
1848                 */
1849                private String parseQuotedToken(final char[] terminators) {
1850                        char ch;
1851                        i1 = pos;
1852                        i2 = pos;
1853                        var quoted = false;
1854                        var charEscaped = false;
1855                        while (hasChar()) {
1856                                ch = chars[pos];
1857                                if (!quoted && isOneOf(ch, terminators)) {
1858                                        break;
1859                                }
1860                                if (!charEscaped && ch == '"') {
1861                                        quoted = !quoted;
1862                                }
1863                                charEscaped = !charEscaped && ch == '\\';
1864                                i2++;
1865                                pos++;
1866
1867                        }
1868                        return getToken(true);
1869                }
1870
1871                /**
1872                 * Parses out a token until any of the given terminators is encountered.
1873                 *
1874                 * @param terminators the array of terminating characters. Any of these characters when encountered signify the end of the token
1875                 * @return the token
1876                 */
1877                private String parseToken(final char[] terminators) {
1878                        char ch;
1879                        i1 = pos;
1880                        i2 = pos;
1881                        while (hasChar()) {
1882                                ch = chars[pos];
1883                                if (isOneOf(ch, terminators)) {
1884                                        break;
1885                                }
1886                                i2++;
1887                                pos++;
1888                        }
1889                        return getToken(false);
1890                }
1891
1892                /**
1893                 * Sets the flag if parameter names are to be converted to lower case when name/value pairs are parsed.
1894                 *
1895                 * @param lowerCaseNames {@code true} if parameter names are to be converted to lower case when name/value pairs are parsed. {@code false} otherwise.
1896                 */
1897                public void setLowerCaseNames(final boolean lowerCaseNames) {
1898                        this.lowerCaseNames = lowerCaseNames;
1899                }
1900
1901        }
1902
        /**
         * Utility class to decode/encode the character set of HTTP header fields based on RFC 2231. This implementation adheres to RFC 5987 in particular, which was
         * defined for HTTP headers.
         * <p>
         * RFC 5987 builds on RFC 2231 but has a narrower scope, with a <a href="https://tools.ietf.org/html/rfc5987#section-3.2">mandatory charset definition</a> and
         * <a href="https://tools.ietf.org/html/rfc5987#section-4">no parameter continuation</a>.
         * </p>
         *
         * @see <a href="https://tools.ietf.org/html/rfc2231">RFC 2231</a>
         * @see <a href="https://tools.ietf.org/html/rfc5987">RFC 5987</a>
         */
1914        protected final class RFC2231Utils {
1915
1916                /**
1917                 * The Hexadecimal values char array.
1918                 */
1919                private static final char[] HEX_DIGITS = "0123456789ABCDEF".toCharArray();
1920                /**
1921                 * The Hexadecimal representation of 127.
1922                 */
1923                private static final byte MASK = 0x7f;
1924                /**
1925                 * The Hexadecimal representation of 128.
1926                 */
1927                private static final int MASK_128 = 0x80;
1928                /**
1929                 * The Hexadecimal decode value.
1930                 */
1931                private static final byte[] HEX_DECODE = new byte[MASK_128];
1932
1933                // create a ASCII decoded array of Hexadecimal values
1934                static {
1935                        for (var i = 0; i < HEX_DIGITS.length; i++) {
1936                                HEX_DECODE[HEX_DIGITS[i]] = (byte) i;
1937                                HEX_DECODE[Character.toLowerCase(HEX_DIGITS[i])] = (byte) i;
1938                        }
1939                }
1940
1941                /**
1942                 * Decodes a string of text obtained from a HTTP header as per RFC 2231
1943                 *
1944                 * <b>Eg 1.</b> {@code us-ascii'en-us'This%20is%20%2A%2A%2Afun%2A%2A%2A} will be decoded to {@code This is ***fun***}
1945                 *
1946                 * <b>Eg 2.</b> {@code iso-8859-1'en'%A3%20rate} will be decoded to {@code £ rate}.
1947                 *
1948                 * <b>Eg 3.</b> {@code UTF-8''%c2%a3%20and%20%e2%82%ac%20rates} will be decoded to {@code £ and € rates}.
1949                 *
1950                 * @param encodedText - Text to be decoded has a format of {@code <charset>'<language>'<encoded_value>} and ASCII only
1951                 * @return Decoded text based on charset encoding
1952                 * @throws UnsupportedEncodingException The requested character set wasn't found.
1953                 */
1954                static String decodeText(final String encodedText) throws UnsupportedEncodingException {
1955                        final var langDelimitStart = encodedText.indexOf('\'');
1956                        if (langDelimitStart == -1) {
1957                                // missing charset
1958                                return encodedText;
1959                        }
1960                        final var mimeCharset = encodedText.substring(0, langDelimitStart);
1961                        final var langDelimitEnd = encodedText.indexOf('\'', langDelimitStart + 1);
1962                        if (langDelimitEnd == -1) {
1963                                // missing language
1964                                return encodedText;
1965                        }
1966                        final var bytes = fromHex(encodedText.substring(langDelimitEnd + 1));
1967                        return new String(bytes, getJavaCharset(mimeCharset));
1968                }
1969
1970                /**
1971                 * Converts {@code text} to their corresponding Hex value.
1972                 *
1973                 * @param text - ASCII text input
1974                 * @return Byte array of characters decoded from ASCII table
1975                 */
1976                private static byte[] fromHex(final String text) {
1977                        final var shift = 4;
1978                        final var out = new ByteArrayOutputStream(text.length());
1979                        for (var i = 0; i < text.length(); ) {
1980                                final var c = text.charAt(i++);
1981                                if (c == '%') {
1982                                        if (i > text.length() - 2) {
1983                                                break; // unterminated sequence
1984                                        }
1985                                        final var b1 = HEX_DECODE[text.charAt(i++) & MASK];
1986                                        final var b2 = HEX_DECODE[text.charAt(i++) & MASK];
1987                                        out.write(b1 << shift | b2);
1988                                } else {
1989                                        out.write((byte) c);
1990                                }
1991                        }
1992                        return out.toByteArray();
1993                }
1994
1995                private static String getJavaCharset(final String mimeCharset) {
1996                        // good enough for standard values
1997                        return mimeCharset;
1998                }
1999
2000                /**
2001                 * Tests if asterisk (*) at the end of parameter name to indicate, if it has charset and language information to decode the value.
2002                 *
2003                 * @param paramName The parameter, which is being checked.
2004                 * @return {@code true}, if encoded as per RFC 2231, {@code false} otherwise
2005                 */
2006                static boolean hasEncodedValue(final String paramName) {
2007                        if (paramName != null) {
2008                                return paramName.lastIndexOf('*') == paramName.length() - 1;
2009                        }
2010                        return false;
2011                }
2012
2013                /**
2014                 * If {@code paramName} has Asterisk (*) at the end, it will be stripped off, else the passed value will be returned.
2015                 *
2016                 * @param paramName The parameter, which is being inspected.
2017                 * @return stripped {@code paramName} of Asterisk (*), if RFC2231 encoded
2018                 */
2019                static String stripDelimiter(final String paramName) {
2020                        if (hasEncodedValue(paramName)) {
2021                                final var paramBuilder = new StringBuilder(paramName);
2022                                paramBuilder.deleteCharAt(paramName.lastIndexOf('*'));
2023                                return paramBuilder.toString();
2024                        }
2025                        return paramName;
2026                }
2027
2028                /**
2029                 * Private constructor so that no instances can be created. This class contains only static utility methods.
2030                 */
2031                private RFC2231Utils() {
2032                }
2033        }
2034
2035        /**
2036         * Utility class to decode MIME texts.
2037         */
2038        protected final class MimeUtils {
2039
2040                /**
2041                 * The marker to indicate text is encoded with BASE64 algorithm.
2042                 */
2043                private static final String BASE64_ENCODING_MARKER = "B";
2044
2045                /**
2046                 * The marker to indicate text is encoded with QuotedPrintable algorithm.
2047                 */
2048                private static final String QUOTEDPRINTABLE_ENCODING_MARKER = "Q";
2049
2050                /**
2051                 * If the text contains any encoded tokens, those tokens will be marked with "=?".
2052                 */
2053                private static final String ENCODED_TOKEN_MARKER = "=?";
2054
2055                /**
2056                 * If the text contains any encoded tokens, those tokens will terminate with "=?".
2057                 */
2058                private static final String ENCODED_TOKEN_FINISHER = "?=";
2059
2060                /**
2061                 * The linear whitespace chars sequence.
2062                 */
2063                private static final String LINEAR_WHITESPACE = " \t\r\n";
2064
2065                /**
2066                 * Mappings between MIME and Java charset.
2067                 */
2068                private static final Map<String, String> MIME2JAVA = new HashMap<>();
2069
2070                static {
2071                        MIME2JAVA.put("iso-2022-cn", "ISO2022CN");
2072                        MIME2JAVA.put("iso-2022-kr", "ISO2022KR");
2073                        MIME2JAVA.put("utf-8", "UTF8");
2074                        MIME2JAVA.put("utf8", "UTF8");
2075                        MIME2JAVA.put("ja_jp.iso2022-7", "ISO2022JP");
2076                        MIME2JAVA.put("ja_jp.eucjp", "EUCJIS");
2077                        MIME2JAVA.put("euc-kr", "KSC5601");
2078                        MIME2JAVA.put("euckr", "KSC5601");
2079                        MIME2JAVA.put("us-ascii", "ISO-8859-1");
2080                        MIME2JAVA.put("x-us-ascii", "ISO-8859-1");
2081                }
2082
2083                /**
2084                 * Decodes a string of text obtained from a mail header into its proper form. The text generally will consist of a string of tokens, some of which may be
2085                 * encoded using base64 encoding.
2086                 *
2087                 * @param text The text to decode.
2088                 * @return The decoded text string.
2089                 * @throws UnsupportedEncodingException if the detected encoding in the input text is not supported.
2090                 */
2091                static String decodeText(final String text) throws UnsupportedEncodingException {
2092                        // if the text contains any encoded tokens, those tokens will be marked with "=?". If the
2093                        // source string doesn't contain that sequent, no decoding is required.
2094                        if (!text.contains(ENCODED_TOKEN_MARKER)) {
2095                                return text;
2096                        }
2097
2098                        var offset = 0;
2099                        final var endOffset = text.length();
2100
2101                        var startWhiteSpace = -1;
2102                        var endWhiteSpace = -1;
2103
2104                        final var decodedText = new StringBuilder(text.length());
2105
2106                        var previousTokenEncoded = false;
2107
2108                        while (offset < endOffset) {
2109                                var ch = text.charAt(offset);
2110
2111                                // is this a whitespace character?
2112                                if (LINEAR_WHITESPACE.indexOf(ch) != -1) { // whitespace found
2113                                        startWhiteSpace = offset;
2114                                        while (offset < endOffset) {
2115                                                // step over the white space characters.
2116                                                ch = text.charAt(offset);
2117                                                if (LINEAR_WHITESPACE.indexOf(ch) == -1) {
2118                                                        // record the location of the first non lwsp and drop down to process the
2119                                                        // token characters.
2120                                                        endWhiteSpace = offset;
2121                                                        break;
2122                                                }
2123                                                offset++;
2124                                        }
2125                                } else {
2126                                        // we have a word token. We need to scan over the word and then try to parse it.
2127                                        final var wordStart = offset;
2128
2129                                        while (offset < endOffset) {
2130                                                // step over the non white space characters.
2131                                                ch = text.charAt(offset);
2132                                                if (LINEAR_WHITESPACE.indexOf(ch) != -1) {
2133                                                        break;
2134                                                }
2135                                                offset++;
2136
2137                                                // NB: Trailing whitespace on these header strings will just be discarded.
2138                                        }
2139                                        // pull out the word token.
2140                                        final var word = text.substring(wordStart, offset);
2141                                        // is the token encoded? decode the word
2142                                        if (word.startsWith(ENCODED_TOKEN_MARKER)) {
2143                                                try {
2144                                                        // if this gives a parsing failure, treat it like a non-encoded word.
2145                                                        final var decodedWord = decodeWord(word);
2146
2147                                                        // are any whitespace characters significant? Append 'em if we've got 'em.
2148                                                        if (!previousTokenEncoded && startWhiteSpace != -1) {
2149                                                                decodedText.append(text, startWhiteSpace, endWhiteSpace);
2150                                                                startWhiteSpace = -1;
2151                                                        }
2152                                                        // this is definitely a decoded token.
2153                                                        previousTokenEncoded = true;
2154                                                        // and add this to the text.
2155                                                        decodedText.append(decodedWord);
2156                                                        // we continue parsing from here...we allow parsing errors to fall through
2157                                                        // and get handled as normal text.
2158                                                        continue;
2159
2160                                                } catch (final ParseException ignored) {
2161                                                        // just ignore it, skip to next word
2162                                                }
2163                                        }
2164                                        // this is a normal token, so it doesn't matter what the previous token was. Add the white space
2165                                        // if we have it.
2166                                        if (startWhiteSpace != -1) {
2167                                                decodedText.append(text, startWhiteSpace, endWhiteSpace);
2168                                                startWhiteSpace = -1;
2169                                        }
2170                                        // this is not a decoded token.
2171                                        previousTokenEncoded = false;
2172                                        decodedText.append(word);
2173                                }
2174                        }
2175
2176                        return decodedText.toString();
2177                }
2178
2179                /**
2180                 * Decodes a string using the RFC 2047 rules for an "encoded-word" type. This encoding has the syntax:
2181                 * <p>
2182                 * encoded-word = "=?" charset "?" encoding "?" encoded-text "?="
2183                 *
2184                 * @param word The possibly encoded word value.
2185                 * @return The decoded word.
2186                 * @throws ParseException               in case of a parse error of the RFC 2047.
2187                 * @throws UnsupportedEncodingException Thrown when Invalid RFC 2047 encoding was found.
2188                 */
2189                private static String decodeWord(final String word) throws ParseException, UnsupportedEncodingException {
2190                        // encoded words start with the characters "=?". If this not an encoded word, we throw a
2191                        // ParseException for the caller.
2192
2193                        final var etmPos = word.indexOf(ENCODED_TOKEN_MARKER);
2194                        if (etmPos != 0) {
2195                                throw new ParseException("Invalid RFC 2047 encoded-word: " + word, etmPos);
2196                        }
2197
2198                        final var charsetPos = word.indexOf('?', 2);
2199                        if (charsetPos == -1) {
2200                                throw new ParseException("Missing charset in RFC 2047 encoded-word: " + word, charsetPos);
2201                        }
2202
2203                        // pull out the character set information (this is the MIME name at this point).
2204                        final var charset = word.substring(2, charsetPos).toLowerCase(Locale.ENGLISH);
2205
2206                        // now pull out the encoding token the same way.
2207                        final var encodingPos = word.indexOf('?', charsetPos + 1);
2208                        if (encodingPos == -1) {
2209                                throw new ParseException("Missing encoding in RFC 2047 encoded-word: " + word, encodingPos);
2210                        }
2211
2212                        final var encoding = word.substring(charsetPos + 1, encodingPos);
2213
2214                        // and finally the encoded text.
2215                        final var encodedTextPos = word.indexOf(ENCODED_TOKEN_FINISHER, encodingPos + 1);
2216                        if (encodedTextPos == -1) {
2217                                throw new ParseException("Missing encoded text in RFC 2047 encoded-word: " + word, encodedTextPos);
2218                        }
2219
2220                        final var encodedText = word.substring(encodingPos + 1, encodedTextPos);
2221
2222                        // seems a bit silly to encode a null string, but easy to deal with.
2223                        if (encodedText.isEmpty()) {
2224                                return "";
2225                        }
2226
2227                        try {
2228                                // the decoder writes directly to an output stream.
2229                                final var out = new ByteArrayOutputStream(encodedText.length());
2230
2231                                final var encodedData = encodedText.getBytes(StandardCharsets.US_ASCII);
2232
2233                                // Base64 encoded?
2234                                if (encoding.equals(BASE64_ENCODING_MARKER)) {
2235                                        out.write(Base64.getMimeDecoder().decode(encodedData));
2236                                } else if (encoding.equals(QUOTEDPRINTABLE_ENCODING_MARKER)) { // maybe quoted printable.
2237                                        QuotedPrintableDecoder.decode(encodedData, out);
2238                                } else {
2239                                        throw new UnsupportedEncodingException("Unknown RFC 2047 encoding: " + encoding);
2240                                }
2241                                // get the decoded byte data and convert into a string.
2242                                final var decodedData = out.toByteArray();
2243                                return new String(decodedData, javaCharset(charset));
2244                        } catch (final IOException e) {
2245                                throw new UnsupportedEncodingException("Invalid RFC 2047 encoding");
2246                        }
2247                }
2248
2249                /**
2250                 * Translate a MIME standard character set name into the Java equivalent.
2251                 *
2252                 * @param charset The MIME standard name.
2253                 * @return The Java equivalent for this name.
2254                 */
2255                private static String javaCharset(final String charset) {
2256                        // nothing in, nothing out.
2257                        if (charset == null) {
2258                                return null;
2259                        }
2260                        final var mappedCharset = MIME2JAVA.get(charset.toLowerCase(Locale.ENGLISH));
2261                        // if there is no mapping, then the original name is used. Many of the MIME character set
2262                        // names map directly back into Java. The reverse isn't necessarily true.
2263                        return mappedCharset == null ? charset : mappedCharset;
2264                }
2265
2266                /**
2267                 * Hidden constructor, this class must not be instantiated.
2268                 */
2269                private MimeUtils() {
2270                        // do nothing
2271                }
2272
2273        }
2274
2275        protected final class QuotedPrintableDecoder {
2276
2277                /**
2278                 * The shift value required to create the upper nibble from the first of 2 byte values converted from ASCII hex.
2279                 */
2280                private static final int UPPER_NIBBLE_SHIFT = Byte.SIZE / 2;
2281
2282                /**
2283                 * Decodes the encoded byte data writing it to the given output stream.
2284                 *
2285                 * @param data The array of byte data to decode.
2286                 * @param out  The output stream used to return the decoded data.
2287                 * @return the number of bytes produced.
2288                 * @throws IOException if an IO error occurs
2289                 */
2290                public static int decode(final byte[] data, final OutputStream out) throws IOException {
2291                        var off = 0;
2292                        final var length = data.length;
2293                        final var endOffset = off + length;
2294                        var bytesWritten = 0;
2295
2296                        while (off < endOffset) {
2297                                final var ch = data[off++];
2298
2299                                // space characters were translated to '_' on encode, so we need to translate them back.
2300                                if (ch == '_') {
2301                                        out.write(' ');
2302                                } else if (ch == '=') {
2303                                        // we found an encoded character. Reduce the 3 char sequence to one.
2304                                        // but first, make sure we have two characters to work with.
2305                                        if (off + 1 >= endOffset) {
2306                                                throw new IOException("Invalid quoted printable encoding; truncated escape sequence");
2307                                        }
2308
2309                                        final var b1 = data[off++];
2310                                        final var b2 = data[off++];
2311
2312                                        // we've found an encoded carriage return. The next char needs to be a newline
2313                                        if (b1 == '\r') {
2314                                                if (b2 != '\n') {
2315                                                        throw new IOException("Invalid quoted printable encoding; CR must be followed by LF");
2316                                                }
2317                                                // this was a soft linebreak inserted by the encoding. We just toss this away
2318                                                // on decode.
2319                                        } else {
2320                                                // this is a hex pair we need to convert back to a single byte.
2321                                                final var c1 = hexToBinary(b1);
2322                                                final var c2 = hexToBinary(b2);
2323                                                out.write(c1 << UPPER_NIBBLE_SHIFT | c2);
2324                                                // 3 bytes in, one byte out
2325                                                bytesWritten++;
2326                                        }
2327                                } else {
2328                                        // simple character, just write it out.
2329                                        out.write(ch);
2330                                        bytesWritten++;
2331                                }
2332                        }
2333
2334                        return bytesWritten;
2335                }
2336
2337                /**
2338                 * Converts a hexadecimal digit to the binary value it represents.
2339                 *
2340                 * @param b the ASCII hexadecimal byte to convert (0-0, A-F, a-f)
2341                 * @return the int value of the hexadecimal byte, 0-15
2342                 * @throws IOException if the byte is not a valid hexadecimal digit.
2343                 */
2344                private static int hexToBinary(final byte b) throws IOException {
2345                        // CHECKSTYLE IGNORE MagicNumber FOR NEXT 1 LINE
2346                        final var i = Character.digit((char) b, 16);
2347                        if (i == -1) {
2348                                throw new IOException("Invalid quoted printable encoding: not a valid hex digit: " + b);
2349                        }
2350                        return i;
2351                }
2352
2353                /**
2354                 * Hidden constructor, this class must not be instantiated.
2355                 */
2356                private QuotedPrintableDecoder() {
2357                        // do nothing
2358                }
2359
2360        }
2361
2362        // *** END commons-fileupload source ***
2363
2364        // For HTML-Unescaper below, see https://gist.github.com/MarkJeronimus/798c452582e64410db769933ec71cfb7
2365
2366        // *** START HTML-Unescaper source ***
2367
2368        /**
2369         * HTML Un-escaper by Nick Frolov.
2370         * <p>
2371         * With improvement suggested by Axel Dörfler.
2372         * <p>
2373         * Replaced character map with HTML5 characters from<a href="https://www.w3schools.com/charsets/ref_html_entities_a.asp">
2374         * https://www.w3schools.com/charsets/ref_html_entities_a.asp</a>
2375         *
2376         * @author Nick Frolov, Mark Jeronimus
2377         */
2378// Created 2020-06-22
2379        protected static class HTMLUtilities {
2380                // Tables optimized for smallest .class size (without resorting to compression)
2381                private static final String[] NAMES =
2382                                {"excl", "quot", "num", "dollar", "percnt", "amp", "apos", "lpar", "rpar", "ast", "midast", "plus", "comma",
2383                                                "period", "sol", "colon", "semi", "lt", "equals", "GT", "quest", "commat", "lbrack", "lsqb", "bsol",
2384                                                "rbrack", "rsqb", "Hat", "lowbar", "UnderBar", "DiacriticalGrave", "grave", "lbrace", "lcub", "verbar",
2385                                                "vert", "VerticalLine", "rbrace", "rcub", "nbsp", "NonBreakingSpace", "iexcl", "cent", "pound", "curren",
2386                                                "yen", "brvbar", "sect", "die", "Dot", "DoubleDot", "uml", "copy", "ordf", "laquo", "not", "shy",
2387                                                "circledR", "reg", "macr", "strns", "deg", "plusmn", "pm", "sup2", "sup3", "acute", "DiacriticalAcute",
2388                                                "micro", "para", "CenterDot", "centerdot", "middot", "cedil", "Cedilla", "sup1", "ordm", "raquo", "frac14",
2389                                                "frac12", "half", "frac34", "iquest", "Agrave", "Aacute", "Acirc", "Atilde", "Auml", "angst", "Aring",
2390                                                "AElig", "Ccedil", "Egrave", "Eacute", "Ecirc", "Euml", "Igrave", "Iacute", "Icirc", "Iuml", "ETH",
2391                                                "Ntilde", "Ograve", "Oacute", "Ocirc", "Otilde", "Ouml", "times", "Oslash", "Ugrave", "Uacute", "Ucirc",
2392                                                "Uuml", "Yacute", "THORN", "szlig", "agrave", "aacute", "acirc", "atilde", "auml", "aring", "aelig",
2393                                                "ccedil", "egrave", "eacute", "ecirc", "euml", "igrave", "iacute", "icirc", "iuml", "eth", "ntilde",
2394                                                "ograve", "oacute", "ocirc", "otilde", "ouml", "div", "divide", "oslash", "ugrave", "uacute", "ucirc",
2395                                                "uuml", "yacute", "thorn", "yuml", "Amacr", "amacr", "Abreve", "abreve", "Aogon", "aogon", "Cacute",
2396                                                "cacute", "Ccirc", "ccirc", "Cdot", "cdot", "Ccaron", "ccaron", "Dcaron", "dcaron", "Dstrok", "dstrok",
2397                                                "Emacr", "emacr", "Edot", "edot", "Eogon", "eogon", "Ecaron", "ecaron", "Gcirc", "gcirc", "Gbreve",
2398                                                "gbreve", "Gdot", "gdot", "Gcedil", "Hcirc", "hcirc", "Hstrok", "hstrok", "Itilde", "itilde", "Imacr",
2399                                                "imacr", "Iogon", "iogon", "Idot", "imath", "inodot", "IJlig", "ijlig", "Jcirc", "jcirc", "Kcedil",
2400                                                "kcedil", "kgreen", "Lacute", "lacute", "Lcedil", "lcedil", "Lcaron", "lcaron", "Lmidot", "lmidot",
2401                                                "Lstrok", "lstrok", "Nacute", "nacute", "Ncedil", "ncedil", "Ncaron", "ncaron", "napos", "ENG", "eng",
2402                                                "Omacr", "omacr", "Odblac", "odblac", "OElig", "oelig", "Racute", "racute", "Rcedil", "rcedil", "Rcaron",
2403                                                "rcaron", "Sacute", "sacute", "Scirc", "scirc", "Scedil", "scedil", "Scaron", "scaron", "Tcedil", "tcedil",
2404                                                "Tcaron", "tcaron", "Tstrok", "tstrok", "Utilde", "utilde", "Umacr", "umacr", "Ubreve", "ubreve", "Uring",
2405                                                "uring", "Udblac", "udblac", "Uogon", "uogon", "Wcirc", "wcirc", "Ycirc", "ycirc", "Yuml", "Zacute",
2406                                                "zacute", "Zdot", "zdot", "Zcaron", "zcaron", "fnof", "imped", "gacute", "jmath", "circ", "caron", "Hacek",
2407                                                "Breve", "breve", "DiacriticalDot", "dot", "ring", "ogon", "DiacriticalTilde", "tilde", "dblac",
2408                                                "DiacriticalDoubleAcute", "DownBreve", "Alpha", "Beta", "Gamma", "Delta", "Epsilon", "Zeta", "Eta",
2409                                                "Theta", "Iota", "Kappa", "Lambda", "Mu", "Nu", "Xi", "Omicron", "Pi", "Rho", "Sigma", "Tau", "Upsilon",
2410                                                "Phi", "Chi", "Psi", "ohm", "Omega", "alpha", "beta", "gamma", "delta", "epsi", "epsilon", "zeta", "eta",
2411                                                "theta", "iota", "kappa", "lambda", "mu", "nu", "xi", "omicron", "pi", "rho", "sigmaf", "sigmav",
2412                                                "varsigma", "sigma", "tau", "upsi", "upsilon", "phi", "chi", "psi", "omega", "thetasym", "thetav",
2413                                                "vartheta", "Upsi", "upsih", "phiv", "straightphi", "varphi", "piv", "varpi", "Gammad", "digamma",
2414                                                "gammad", "kappav", "varkappa", "rhov", "varrho", "epsiv", "straightepsilon", "varepsilon", "backepsilon",
2415                                                "bepsi", "IOcy", "DJcy", "GJcy", "Jukcy", "DScy", "Iukcy", "YIcy", "Jsercy", "LJcy", "NJcy", "TSHcy",
2416                                                "KJcy", "Ubrcy", "DZcy", "Acy", "Bcy", "Vcy", "Gcy", "Dcy", "IEcy", "ZHcy", "Zcy", "Icy", "Jcy", "Kcy",
2417                                                "Lcy", "Mcy", "Ncy", "Ocy", "Pcy", "Rcy", "Scy", "Tcy", "Ucy", "Fcy", "KHcy", "TScy", "CHcy", "SHcy",
2418                                                "SHCHcy", "HARDcy", "Ycy", "SOFTcy", "Ecy", "YUcy", "YAcy", "acy", "bcy", "vcy", "gcy", "dcy", "iecy",
2419                                                "zhcy", "zcy", "icy", "jcy", "kcy", "lcy", "mcy", "ncy", "ocy", "pcy", "rcy", "scy", "tcy", "ucy", "fcy",
2420                                                "khcy", "tscy", "chcy", "shcy", "shchcy", "hardcy", "ycy", "softcy", "ecy", "yucy", "yacy", "iocy", "djcy",
2421                                                "gjcy", "jukcy", "dscy", "iukcy", "yicy", "jsercy", "ljcy", "njcy", "tshcy", "kjcy", "ubrcy", "dzcy",
2422                                                "ensp", "emsp", "emsp13", "emsp14", "numsp", "puncsp", "thinsp", "ThinSpace", "hairsp", "VeryThinSpace",
2423                                                "ZeroWidthSpace", "zwnj", "zwj", "lrm", "rlm", "dash", "hyphen", "ndash", "mdash", "horbar", "Verbar",
2424                                                "Vert", "lsquo", "OpenCurlyQuote", "CloseCurlyQuote", "rsquo", "rsquor", "lsquor", "sbquo", "ldquo",
2425                                                "OpenCurlyDoubleQuote", "CloseCurlyDoubleQuote", "rdquo", "rdquor", "bdquo", "ldquor", "dagger", "ddagger",
2426                                                "bull", "bullet", "nldr", "hellip", "mldr", "permil", "pertenk", "prime", "Prime", "tprime", "backprime",
2427                                                "bprime", "lsaquo", "rsaquo", "oline", "OverBar", "caret", "hybull", "frasl", "bsemi", "qprime",
2428                                                "MediumSpace", "NoBreak", "af", "ApplyFunction", "InvisibleTimes", "it", "ic", "InvisibleComma", "euro",
2429                                                "tdot", "TripleDot", "DotDot", "complexes", "Copf", "incare", "gscr", "hamilt", "HilbertSpace", "Hscr",
2430                                                "Hfr", "Poincareplane", "Hopf", "quaternions", "planckh", "hbar", "hslash", "planck", "plankv", "imagline",
2431                                                "Iscr", "Ifr", "Im", "image", "imagpart", "lagran", "Laplacetrf", "Lscr", "ell", "naturals", "Nopf",
2432                                                "numero", "copysr", "weierp", "wp", "Popf", "primes", "Qopf", "rationals", "realine", "Rscr", "Re", "real",
2433                                                "realpart", "Rfr", "reals", "Ropf", "rx", "TRADE", "trade", "integers", "Zopf", "mho", "zeetrf", "Zfr",
2434                                                "iiota", "bernou", "Bernoullis", "Bscr", "Cayleys", "Cfr", "escr", "Escr", "expectation", "Fouriertrf",
2435                                                "Fscr", "Mellintrf", "Mscr", "phmmat", "order", "orderof", "oscr", "alefsym", "aleph", "beth", "gimel",
2436                                                "daleth", "CapitalDifferentialD", "DD", "dd", "DifferentialD", "ee", "ExponentialE", "exponentiale", "ii",
2437                                                "ImaginaryI", "frac13", "frac23", "frac15", "frac25", "frac35", "frac45", "frac16", "frac56", "frac18",
2438                                                "frac38", "frac58", "frac78", "larr", "LeftArrow", "leftarrow", "ShortLeftArrow", "slarr", "ShortUpArrow",
2439                                                "uarr", "UpArrow", "uparrow", "rarr", "RightArrow", "rightarrow", "ShortRightArrow", "srarr", "darr",
2440                                                "DownArrow", "downarrow", "ShortDownArrow", "harr", "LeftRightArrow", "leftrightarrow", "UpDownArrow",
2441                                                "updownarrow", "varr", "nwarr", "nwarrow", "UpperLeftArrow", "nearr", "nearrow", "UpperRightArrow",
2442                                                "LowerRightArrow", "searr", "searrow", "LowerLeftArrow", "swarr", "swarrow", "nlarr", "nleftarrow",
2443                                                "nrarr", "nrightarrow", "rarrw", "rightsquigarrow", "Larr", "twoheadleftarrow", "Uarr", "Rarr",
2444                                                "twoheadrightarrow", "Darr", "larrtl", "leftarrowtail", "rarrtl", "rightarrowtail", "LeftTeeArrow",
2445                                                "mapstoleft", "mapstoup", "UpTeeArrow", "map", "mapsto", "RightTeeArrow", "DownTeeArrow", "mapstodown",
2446                                                "hookleftarrow", "larrhk", "hookrightarrow", "rarrhk", "larrlp", "looparrowleft", "looparrowright",
2447                                                "rarrlp", "harrw", "leftrightsquigarrow", "nharr", "nleftrightarrow", "Lsh", "lsh", "Rsh", "rsh", "ldsh",
2448                                                "rdsh", "crarr", "cularr", "curvearrowleft", "curarr", "curvearrowright", "circlearrowleft", "olarr",
2449                                                "circlearrowright", "orarr", "leftharpoonup", "LeftVector", "lharu", "DownLeftVector", "leftharpoondown",
2450                                                "lhard", "RightUpVector", "uharr", "upharpoonright", "LeftUpVector", "uharl", "upharpoonleft", "rharu",
2451                                                "rightharpoonup", "RightVector", "DownRightVector", "rhard", "rightharpoondown", "dharr",
2452                                                "downharpoonright", "RightDownVector", "dharl", "downharpoonleft", "LeftDownVector", "RightArrowLeftArrow",
2453                                                "rightleftarrows", "rlarr", "udarr", "UpArrowDownArrow", "LeftArrowRightArrow", "leftrightarrows", "lrarr",
2454                                                "leftleftarrows", "llarr", "upuparrows", "uuarr", "rightrightarrows", "rrarr", "ddarr", "downdownarrows",
2455                                                "leftrightharpoons", "lrhar", "ReverseEquilibrium", "Equilibrium", "rightleftharpoons", "rlhar", "nlArr",
2456                                                "nLeftarrow", "nhArr", "nLeftrightarrow", "nrArr", "nRightarrow", "DoubleLeftArrow", "lArr", "Leftarrow",
2457                                                "DoubleUpArrow", "uArr", "Uparrow", "DoubleRightArrow", "Implies", "rArr", "Rightarrow", "dArr",
2458                                                "DoubleDownArrow", "Downarrow", "DoubleLeftRightArrow", "hArr", "iff", "Leftrightarrow",
2459                                                "DoubleUpDownArrow", "Updownarrow", "vArr", "nwArr", "neArr", "seArr", "swArr", "lAarr", "Lleftarrow",
2460                                                "rAarr", "Rrightarrow", "zigrarr", "larrb", "LeftArrowBar", "rarrb", "RightArrowBar", "DownArrowUpArrow",
2461                                                "duarr", "loarr", "roarr", "hoarr", "ForAll", "forall", "comp", "complement", "part", "PartialD", "Exists",
2462                                                "exist", "nexist", "nexists", "NotExists", "empty", "emptyset", "emptyv", "varnothing", "Del", "nabla",
2463                                                "Element", "in", "isin", "isinv", "NotElement", "notin", "notinva", "ni", "niv", "ReverseElement",
2464                                                "SuchThat", "notni", "notniva", "NotReverseElement", "prod", "Product", "coprod", "Coproduct", "Sum",
2465                                                "sum", "minus", "MinusPlus", "mnplus", "mp", "dotplus", "plusdo", "Backslash", "setminus", "setmn",
2466                                                "smallsetminus", "ssetmn", "lowast", "compfn", "SmallCircle", "radic", "Sqrt", "prop", "Proportional",
2467                                                "propto", "varpropto", "vprop", "infin", "angrt", "ang", "angle", "angmsd", "measuredangle", "angsph",
2468                                                "mid", "shortmid", "smid", "VerticalBar", "nmid", "NotVerticalBar", "nshortmid", "nsmid",
2469                                                "DoubleVerticalBar", "par", "parallel", "shortparallel", "spar", "NotDoubleVerticalBar", "npar",
2470                                                "nparallel", "nshortparallel", "nspar", "and", "wedge", "or", "vee", "cap", "cup", "int", "Integral",
2471                                                "Int", "iiint", "tint", "conint", "ContourIntegral", "oint", "Conint", "DoubleContourIntegral", "Cconint",
2472                                                "cwint", "cwconint", "ClockwiseContourIntegral", "cwconint", "awconint", "there4", "Therefore",
2473                                                "therefore", "because", "ratio", "Colon", "Proportion", "dotminus", "minusd", "mDDot", "homtht", "sim",
2474                                                "thicksim", "thksim", "Tilde", "backsim", "bsim", "ac", "mstpos", "acd", "VerticalTilde", "wr", "wreath",
2475                                                "NotTilde", "nsim", "eqsim", "EqualTilde", "esim", "sime", "simeq", "TildeEqual", "NotTildeEqual", "nsime",
2476                                                "nsimeq", "cong", "TildeFullEqual", "simne", "ncong", "NotTildeFullEqual", "ap", "approx", "asymp",
2477                                                "thickapprox", "thkap", "TildeTilde", "nap", "napprox", "NotTildeTilde", "ape", "approxeq", "apid",
2478                                                "backcong", "bcong", "asympeq", "CupCap", "bump", "Bumpeq", "HumpDownHump", "bumpe", "bumpeq", "HumpEqual",
2479                                                "doteq", "DotEqual", "esdot", "doteqdot", "eDot", "efDot", "fallingdotseq", "erDot", "risingdotseq",
2480                                                "Assign", "colone", "coloneq", "ecolon", "eqcolon", "ecir", "eqcirc", "circeq", "cire", "wedgeq", "veeeq",
2481                                                "triangleq", "trie", "equest", "questeq", "ne", "NotEqual", "Congruent", "equiv", "nequiv", "NotCongruent",
2482                                                "le", "leq", "ge", "geq", "GreaterEqual", "lE", "leqq", "LessFullEqual", "gE", "geqq", "GreaterFullEqual",
2483                                                "lnE", "lneqq", "gnE", "gneqq", "ll", "Lt", "NestedLessLess", "gg", "Gt", "NestedGreaterGreater",
2484                                                "between", "twixt", "NotCupCap", "nless", "nlt", "NotLess", "ngt", "ngtr", "NotGreater", "nle", "nleq",
2485                                                "NotLessEqual", "nge", "ngeq", "NotGreaterEqual", "lesssim", "LessTilde", "lsim", "GreaterTilde", "gsim",
2486                                                "gtrsim", "nlsim", "NotLessTilde", "ngsim", "NotGreaterTilde", "LessGreater", "lessgtr", "lg", "gl",
2487                                                "GreaterLess", "gtrless", "NotLessGreater", "ntlg", "NotGreaterLess", "ntgl", "pr", "prec", "Precedes",
2488                                                "sc", "succ", "Succeeds", "prcue", "preccurlyeq", "PrecedesSlantEqual", "sccue", "succcurlyeq",
2489                                                "SucceedsSlantEqual", "PrecedesTilde", "precsim", "prsim", "scsim", "SucceedsTilde", "succsim",
2490                                                "NotPrecedes", "npr", "nprec", "NotSucceeds", "nsc", "nsucc", "sub", "subset", "sup", "Superset", "supset",
2491                                                "nsub", "nsup", "sube", "subseteq", "SubsetEqual", "supe", "SupersetEqual", "supseteq", "NotSubsetEqual",
2492                                                "nsube", "nsubseteq", "NotSupersetEqual", "nsupe", "nsupseteq", "subne", "subsetneq", "supne", "supsetneq",
2493                                                "cupdot", "UnionPlus", "uplus", "sqsub", "sqsubset", "SquareSubset", "sqsup", "sqsupset", "SquareSuperset",
2494                                                "sqsube", "sqsubseteq", "SquareSubsetEqual", "sqsupe", "sqsupseteq", "SquareSupersetEqual", "sqcap",
2495                                                "SquareIntersection", "sqcup", "SquareUnion", "CirclePlus", "oplus", "CircleMinus", "ominus",
2496                                                "CircleTimes", "otimes", "osol", "CircleDot", "odot", "circledcirc", "ocir", "circledast", "oast",
2497                                                "circleddash", "odash", "boxplus", "plusb", "boxminus", "minusb", "boxtimes", "timesb", "dotsquare",
2498                                                "sdotb", "RightTee;", "vdash", "dashv", "LeftTee", "DownTee", "top", "bot", "bottom", "perp", "UpTee",
2499                                                "models", "DoubleRightTee", "vDash", "Vdash", "Vvdash", "VDash", "nvdash", "nvDash", "nVdash", "nVDash",
2500                                                "prurel", "LeftTriangle", "vartriangleleft", "vltri", "RightTriangle", "vartriangleright", "vrtri",
2501                                                "LeftTriangleEqual", "ltrie", "trianglelefteq", "RightTriangleEqual", "rtrie", "trianglerighteq", "origof",
2502                                                "imof", "multimap", "mumap", "hercon", "intcal", "intercal", "veebar", "barvee", "angrtvb", "lrtri",
2503                                                "bigwedge", "Wedge", "xwedge", "bigvee", "Vee", "xvee", "bigcap", "Intersection", "xcap", "bigcup",
2504                                                "Union", "xcup", "diam", "Diamond", "diamond", "sdot", "sstarf", "Star", "divideontimes", "divonx",
2505                                                "bowtie", "ltimes", "rtimes", "leftthreetimes", "lthree", "rightthreetimes", "rthree", "backsimeq",
2506                                                "bsime", "curlyvee", "cuvee", "curlywedge", "cuwed", "Sub", "Subset", "Sup", "Supset", "Cap", "Cup",
2507                                                "fork", "pitchfork", "epar", "lessdot", "ltdot", "gtdot", "gtrdot", "Ll", "Gg", "ggg", "leg", "lesseqgtr",
2508                                                "LessEqualGreater", "gel", "GreaterEqualLess", "gtreqless", "cuepr", "curlyeqprec", "cuesc", "curlyeqsucc",
2509                                                "NotPrecedesSlantEqual", "nprcue", "NotSucceedsSlantEqual", "nsccue", "NotSquareSubsetEqual", "nsqsube",
2510                                                "NotSquareSupersetEqual", "nsqsupe", "lnsim", "gnsim", "precnsim", "prnsim", "scnsim", "succnsim", "nltri",
2511                                                "NotLeftTriangle", "ntriangleleft", "NotRightTriangle", "nrtri", "ntriangleright", "nltrie",
2512                                                "NotLeftTriangleEqual", "ntrianglelefteq", "NotRightTriangleEqual", "nrtrie", "ntrianglerighteq", "vellip",
2513                                                "ctdot", "utdot", "dtdot", "disin", "isinsv", "isins", "isindot", "notinvc", "notinvb", "isinE", "nisd",
2514                                                "xnis", "nis", "notnivc", "notnivb", "barwedge", "doublebarwedge", "lceil", "LeftCeiling", "rceil",
2515                                                "RightCeiling", "LeftFloor", "lfloor", "rfloor", "RightFloor", "drcrop", "dlcrop", "urcrop", "ulcrop",
2516                                                "bnot", "profline", "profsurf", "telrec", "target", "ulcorn", "ulcorner", "urcorn", "urcorner", "dlcorn",
2517                                                "llcorner", "drcorn", "lrcorner", "frown", "sfrown", "smile", "ssmile", "cylcty", "profalar", "topbot",
2518                                                "ovbar", "solbar", "angzarr", "lmoust", "lmoustache", "rmoust", "rmoustache", "OverBracket", "tbrk",
2519                                                "bbrk", "UnderBracket", "bbrktbrk", "OverParenthesis", "UnderParenthesis", "OverBrace", "UnderBrace",
2520                                                "trpezium", "elinters", "blank", "circledS", "oS", "boxh", "HorizontalLine", "boxv", "boxdr", "boxdl",
2521                                                "boxur", "boxul", "boxvr", "boxvl", "boxhd", "boxhu", "boxvh", "boxH", "boxV", "boxdR", "boxDr", "boxDR",
2522                                                "boxdL", "boxDl", "boxDL", "boxuR", "boxUr", "boxUR", "boxuL", "boxUl", "boxUL", "boxvR", "boxVr", "boxVR",
2523                                                "boxvL", "boxVl", "boxVL", "boxHd", "boxhD", "boxHD", "boxHu", "boxhU", "boxHU", "boxvH", "boxVh", "boxVH",
2524                                                "uhblk", "lhblk", "block", "blk14", "blk12", "blk34", "squ", "Square", "square", "blacksquare",
2525                                                "FilledVerySmallSquare", "squarf", "squf", "EmptyVerySmallSquare", "rect", "marker", "fltns",
2526                                                "bigtriangleup", "xutri", "blacktriangle", "utrif", "triangle", "utri", "blacktriangleright", "rtrif",
2527                                                "rtri", "triangleright", "bigtriangledown", "xdtri", "blacktriangledown", "dtrif", "dtri", "triangledown",
2528                                                "blacktriangleleft", "ltrif", "ltri", "triangleleft", "loz", "lozenge", "cir", "tridot", "bigcirc",
2529                                                "xcirc", "ultri", "urtri", "lltri", "EmptySmallSquare", "FilledSmallSquare", "bigstar", "starf", "star",
2530                                                "phone", "female", "male", "spades", "spadesuit", "clubs", "clubsuit", "hearts", "heartsuit",
2531                                                "diamondsuit", "diams", "sung", "flat", "natur", "natural", "sharp", "check", "checkmark", "cross", "malt",
2532                                                "maltese", "sext", "VerticalSeparator", "lbbrk", "rbbrk", "bsolhsub", "suphsol", "LeftDoubleBracket",
2533                                                "lobrk", "RightDoubleBracket", "robrk", "lang", "langle", "LeftAngleBracket", "rang", "rangle",
2534                                                "RightAngleBracket", "Lang", "Rang", "loang", "roang", "LongLeftArrow", "longleftarrow", "xlarr",
2535                                                "LongRightArrow", "longrightarrow", "xrarr", "LongLeftRightArrow", "longleftrightarrow", "xharr",
2536                                                "DoubleLongLeftArrow", "Longleftarrow", "xlArr", "DoubleLongRightArrow", "Longrightarrow", "xrArr",
2537                                                "DoubleLongLeftRightArrow", "Longleftrightarrow", "xhArr", "longmapsto", "xmap", "dzigrarr", "nvlArr",
2538                                                "nvrArr", "nvHarr", "Map", "lbarr", "bkarow", "rbarr", "lBarr", "dbkarow", "rBarr", "drbkarow", "RBarr",
2539                                                "DDotrahd", "UpArrowBar", "DownArrowBar", "Rarrtl", "latail", "ratail", "lAtail", "rAtail", "larrfs",
2540                                                "rarrfs", "larrbfs", "rarrbfs", "nwarhk", "nearhk", "hksearow", "searhk", "hkswarow", "swarhk", "nwnear",
2541                                                "nesear", "toea", "seswar", "tosa", "swnwar", "rarrc", "cudarrr", "ldca", "rdca", "cudarrl", "larrpl",
2542                                                "curarrm", "cularrp", "rarrpl", "harrcir", "Uarrocir", "lurdshar", "ldrushar", "LeftRightVector",
2543                                                "RightUpDownVector", "DownLeftRightVector", "LeftUpDownVector", "LeftVectorBar", "RightVectorBar",
2544                                                "RightUpVectorBar", "RightDownVectorBar", "DownLeftVectorBar", "DownRightVectorBar", "LeftUpVectorBar",
2545                                                "LeftDownVectorBar", "LeftTeeVector", "RightTeeVector", "RightUpTeeVector", "RightDownTeeVector",
2546                                                "DownLeftTeeVector", "DownRightTeeVector", "LeftUpTeeVector", "LeftDownTeeVector", "lHar", "uHar", "rHar",
2547                                                "dHar", "luruhar", "ldrdhar", "ruluhar", "rdldhar", "lharul", "llhard", "rharul", "lrhard", "udhar",
2548                                                "UpEquilibrium", "duhar", "ReverseUpEquilibrium", "RoundImplies", "erarr", "simrarr", "larrsim", "rarrsim",
2549                                                "rarrap", "ltlarr", "gtrarr", "subrarr", "suplarr", "lfisht", "rfisht", "ufisht", "dfisht", "lopar",
2550                                                "ropar", "lbrke", "rbrke", "lbrkslu", "rbrksld", "lbrksld", "rbrkslu", "langd", "rangd", "lparlt",
2551                                                "rpargt", "gtlPar", "ltrPar", "vzigzag", "vangrt", "angrtvbd", "ange", "range", "dwangle", "uwangle",
2552                                                "angmsdaa", "angmsdab", "angmsdac", "angmsdad", "angmsdae", "angmsdaf", "angmsdag", "angmsdah", "bemptyv",
2553                                                "demptyv", "cemptyv", "raemptyv", "laemptyv", "ohbar", "omid", "opar", "operp", "olcross", "odsold",
2554                                                "olcir", "ofcir", "olt", "ogt", "cirscir", "cirE", "solb", "bsolb", "boxbox", "trisb", "rtriltri",
2555                                                "LeftTriangleBar", "RightTriangleBar", "iinfin", "infintie", "nvinfin", "eparsl", "smeparsl", "eqvparsl",
2556                                                "blacklozenge", "lozf", "RuleDelayed", "dsol", "bigodot", "xodot", "bigoplus", "xoplus", "bigotimes",
2557                                                "xotime", "biguplus", "xuplus", "bigsqcup", "xsqcup", "iiiint", "qint", "fpartint", "cirfnint", "awint",
2558                                                "rppolint", "scpolint", "npolint", "pointint", "quatint", "intlarhk", "pluscir", "plusacir", "simplus",
2559                                                "plusdu", "plussim", "plustwo", "mcomma", "minusdu", "loplus", "roplus", "Cross", "timesd", "timesbar",
2560                                                "smashp", "lotimes", "rotimes", "otimesas", "Otimes", "odiv", "triplus", "triminus", "tritime", "intprod",
2561                                                "iprod", "amalg", "capdot", "ncup", "ncap", "capand", "cupor", "cupcap", "capcup", "cupbrcap", "capbrcup",
2562                                                "cupcup", "capcap", "ccups", "ccaps", "ccupssm", "And", "Or", "andand", "oror", "orslope", "andslope",
2563                                                "andv", "orv", "andd", "ord", "wedbar", "sdote", "simdot", "congdot", "easter", "apacir", "apE", "eplus",
2564                                                "pluse", "Esim", "Colone", "Equal", "ddotseq", "eDDot", "equivDD", "ltcir", "gtcir", "ltquest", "gtquest",
2565                                                "leqslant", "les", "LessSlantEqual", "geqslant", "ges", "GreaterSlantEqual", "lesdot", "gesdot", "lesdoto",
2566                                                "gesdoto", "lesdotor", "gesdotol", "lap", "lessapprox", "gap", "gtrapprox", "lne", "lneq", "gne", "gneq",
2567                                                "lnap", "lnapprox", "gnap", "gnapprox", "lEg", "lesseqqgtr", "gEl", "gtreqqless", "lsime", "gsime",
2568                                                "lsimg", "gsiml", "lgE", "glE", "lesges", "gesles", "els", "eqslantless", "egs", "eqslantgtr", "elsdot",
2569                                                "egsdot", "el", "eg", "siml", "simg", "simlE", "simgE", "LessLess", "GreaterGreater", "glj", "gla", "ltcc",
2570                                                "gtcc", "lescc", "gescc", "smt", "lat", "smte", "late", "bumpE", "pre", "PrecedesEqual", "preceq", "sce",
2571                                                "SucceedsEqual", "succeq", "prE", "scE", "precneqq", "prnE", "scnE", "succneqq", "prap", "precapprox",
2572                                                "scap", "succapprox", "precnapprox", "prnap", "scnap", "succnapprox", "Pr", "Sc", "subdot", "supdot",
2573                                                "subplus", "supplus", "submult", "supmult", "subedot", "supedot", "subE", "subseteqq", "supE", "supseteqq",
2574                                                "subsim", "supsim", "subnE", "subsetneqq", "supnE", "supsetneqq", "csub", "csup", "csube", "csupe",
2575                                                "subsup", "supsub", "subsub", "supsup", "suphsub", "supdsub", "forkv", "topfork", "mlcp", "Dashv",
2576                                                "DoubleLeftTee", "Vdashl", "Barv", "vBar", "vBarv", "Vbar", "Not", "bNot", "rnmid", "cirmid", "midcir",
2577                                                "topcir", "nhpar", "parsim", "parsl", "fflig", "filig", "fllig", "ffilig", "ffllig", "Ascr", "Cscr",
2578                                                "Dscr", "Gscr", "Jscr", "Kscr", "Nscr", "Oscr", "Pscr", "Qscr", "Sscr", "Tscr", "Uscr", "Vscr", "Wscr",
2579                                                "Xscr", "Yscr", "Zscr", "ascr", "bscr", "cscr", "dscr", "fscr", "hscr", "iscr", "jscr", "kscr", "lscr",
2580                                                "mscr", "nscr", "pscr", "qscr", "rscr", "sscr", "tscr", "uscr", "vscr", "wscr", "xscr", "yscr", "zscr",
2581                                                "Afr", "Bfr", "Dfr", "Efr", "Ffr", "Gfr", "Jfr", "Kfr", "Lfr", "Mfr", "Nfr", "Ofr", "Pfr", "Qfr", "Sfr",
2582                                                "Tfr", "Ufr", "Vfr", "Wfr", "Xfr", "Yfr", "afr", "bfr", "cfr", "dfr", "efr", "ffr", "gfr", "hfr", "ifr",
2583                                                "jfr", "kfr", "lfr", "mfr", "nfr", "ofr", "pfr", "qfr", "rfr", "sfr", "tfr", "ufr", "vfr", "wfr", "xfr",
2584                                                "yfr", "zfr", "Aopf", "Bopf", "Dopf", "Eopf", "Fopf", "Gopf", "Iopf", "Jopf", "Kopf", "Lopf", "Mopf",
2585                                                "Oopf", "Sopf", "Topf", "Uopf", "Vopf", "Wopf", "Xopf", "Yopf", "aopf", "bopf", "copf", "dopf", "eopf",
2586                                                "fopf", "gopf", "hopf", "iopf", "jopf", "kopf", "lopf", "mopf", "nopf", "oopf", "popf", "qopf", "ropf",
2587                                                "sopf", "topf", "uopf", "vopf", "wopf", "xopf", "yopf", "zopf", "nvlt", "bne", "nvgt", "fjlig",
2588                                                "ThickSpace", "nrarrw", "npart", "nang", "caps", "cups", "nvsim", "race", "acE", "nesim", "NotEqualTilde",
2589                                                "napid", "nvap", "nbump", "NotHumpDownHump", "nbumpe", "NotHumpEqual", "nedot", "bnequiv", "nvle", "nvge",
2590                                                "nlE", "nleqq", "ngE", "ngeqq", "NotGreaterFullEqual", "lvertneqq", "lvnE", "gvertneqq", "gvnE", "nLtv",
2591                                                "NotLessLess", "nLt", "nGtv", "NotGreaterGreater", "nGt", "NotSucceedsTilde", "NotSubset", "nsubset",
2592                                                "vnsub", "NotSuperset", "nsupset", "vnsup", "varsubsetneq", "vsubne", "varsupsetneq", "vsupne",
2593                                                "NotSquareSubset", "NotSquareSuperset", "sqcaps", "sqcups", "nvltrie", "nvrtrie", "nLl", "nGg", "lesg",
2594                                                "gesl", "notindot", "notinE", "nrarrc", "NotLeftTriangleBar", "NotRightTriangleBar", "ncongdot", "napE",
2595                                                "nleqslant", "nles", "NotLessSlantEqual", "ngeqslant", "nges", "NotGreaterSlantEqual", "NotNestedLessLess",
2596                                                "NotNestedGreaterGreater", "smtes", "lates", "NotPrecedesEqual", "npre", "npreceq", "NotSucceedsEqual",
2597                                                "nsce", "nsucceq", "nsubE", "nsubseteqq", "nsupE", "nsupseteqq", "varsubsetneqq", "vsubnE",
2598                                                "varsupsetneqq", "vsupnE", "nparsl"};
2599                private static final int[] CODEPOINTS =
2600                                {33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 42, 43, 44, 46, 47, 58, 59, 60, 61, 62, 63, 64, 91, 91, 92, 93, 93,
2601                                                94, 95, 95, 96, 96, 123, 123, 124, 124, 124, 125, 125, 160, 160, 161, 162, 163, 164, 165, 166, 167, 168,
2602                                                168, 168, 168, 169, 170, 171, 172, 173, 174, 174, 175, 175, 176, 177, 177, 178, 179, 180, 180, 181, 182,
2603                                                183, 183, 183, 184, 184, 185, 186, 187, 188, 189, 189, 190, 191, 192, 193, 194, 195, 196, 197, 197, 198,
2604                                                199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219,
2605                                                220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240,
2606                                                241, 242, 243, 244, 245, 246, 247, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 260,
2607                                                261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 278, 279, 280, 281, 282, 283,
2608                                                284, 285, 286, 287, 288, 289, 290, 292, 293, 294, 295, 296, 297, 298, 299, 302, 303, 304, 305, 305, 306,
2609                                                307, 308, 309, 310, 311, 312, 313, 314, 315, 316, 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327,
2610                                                328, 329, 330, 331, 332, 333, 336, 337, 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350,
2611                                                351, 352, 353, 354, 355, 356, 357, 358, 359, 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371,
2612                                                372, 373, 374, 375, 376, 377, 378, 379, 380, 381, 382, 402, 437, 501, 567, 710, 711, 711, 728, 728, 729,
2613                                                729, 730, 731, 732, 732, 733, 733, 785, 913, 914, 915, 916, 917, 918, 919, 920, 921, 922, 923, 924, 925,
2614                                                926, 927, 928, 929, 931, 932, 933, 934, 935, 936, 937, 937, 945, 946, 947, 948, 949, 949, 950, 951, 952,
2615                                                953, 954, 955, 956, 957, 958, 959, 960, 961, 962, 962, 962, 963, 964, 965, 965, 966, 967, 968, 969, 977,
2616                                                977, 977, 978, 978, 981, 981, 981, 982, 982, 988, 989, 989, 1008, 1008, 1009, 1009, 1013, 1013, 1013, 1014,
2617                                                1014, 1025, 1026, 1027, 1028, 1029, 1030, 1031, 1032, 1033, 1034, 1035, 1036, 1038, 1039, 1040, 1041, 1042,
2618                                                1043, 1044, 1045, 1046, 1047, 1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055, 1056, 1057, 1058, 1059, 1060,
2619                                                1061, 1062, 1063, 1064, 1065, 1066, 1067, 1068, 1069, 1070, 1071, 1072, 1073, 1074, 1075, 1076, 1077, 1078,
2620                                                1079, 1080, 1081, 1082, 1083, 1084, 1085, 1086, 1087, 1088, 1089, 1090, 1091, 1092, 1093, 1094, 1095, 1096,
2621                                                1097, 1098, 1099, 1100, 1101, 1102, 1103, 1105, 1106, 1107, 1108, 1109, 1110, 1111, 1112, 1113, 1114, 1115,
2622                                                1116, 1118, 1119, 8194, 8195, 8196, 8197, 8199, 8200, 8201, 8201, 8202, 8202, 8203, 8204, 8205, 8206, 8207,
2623                                                8208, 8208, 8211, 8212, 8213, 8214, 8214, 8216, 8216, 8217, 8217, 8217, 8218, 8218, 8220, 8220, 8221, 8221,
2624                                                8221, 8222, 8222, 8224, 8225, 8226, 8226, 8229, 8230, 8230, 8240, 8241, 8242, 8243, 8244, 8245, 8245, 8249,
2625                                                8250, 8254, 8254, 8257, 8259, 8260, 8271, 8279, 8287, 8288, 8289, 8289, 8290, 8290, 8291, 8291, 8364, 8411,
2626                                                8411, 8412, 8450, 8450, 8453, 8458, 8459, 8459, 8459, 8460, 8460, 8461, 8461, 8462, 8463, 8463, 8463, 8463,
2627                                                8464, 8464, 8465, 8465, 8465, 8465, 8466, 8466, 8466, 8467, 8469, 8469, 8470, 8471, 8472, 8472, 8473, 8473,
2628                                                8474, 8474, 8475, 8475, 8476, 8476, 8476, 8476, 8477, 8477, 8478, 8482, 8482, 8484, 8484, 8487, 8488, 8488,
2629                                                8489, 8492, 8492, 8492, 8493, 8493, 8495, 8496, 8496, 8497, 8497, 8499, 8499, 8499, 8500, 8500, 8500, 8501,
2630                                                8501, 8502, 8503, 8504, 8517, 8517, 8518, 8518, 8519, 8519, 8519, 8520, 8520, 8531, 8532, 8533, 8534, 8535,
2631                                                8536, 8537, 8538, 8539, 8540, 8541, 8542, 8592, 8592, 8592, 8592, 8592, 8593, 8593, 8593, 8593, 8594, 8594,
2632                                                8594, 8594, 8594, 8595, 8595, 8595, 8595, 8596, 8596, 8596, 8597, 8597, 8597, 8598, 8598, 8598, 8599, 8599,
2633                                                8599, 8600, 8600, 8600, 8601, 8601, 8601, 8602, 8602, 8603, 8603, 8605, 8605, 8606, 8606, 8607, 8608, 8608,
2634                                                8609, 8610, 8610, 8611, 8611, 8612, 8612, 8613, 8613, 8614, 8614, 8614, 8615, 8615, 8617, 8617, 8618, 8618,
2635                                                8619, 8619, 8620, 8620, 8621, 8621, 8622, 8622, 8624, 8624, 8625, 8625, 8626, 8627, 8629, 8630, 8630, 8631,
2636                                                8631, 8634, 8634, 8635, 8635, 8636, 8636, 8636, 8637, 8637, 8637, 8638, 8638, 8638, 8639, 8639, 8639, 8640,
2637                                                8640, 8640, 8641, 8641, 8641, 8642, 8642, 8642, 8643, 8643, 8643, 8644, 8644, 8644, 8645, 8645, 8646, 8646,
2638                                                8646, 8647, 8647, 8648, 8648, 8649, 8649, 8650, 8650, 8651, 8651, 8651, 8652, 8652, 8652, 8653, 8653, 8654,
2639                                                8654, 8655, 8655, 8656, 8656, 8656, 8657, 8657, 8657, 8658, 8658, 8658, 8658, 8659, 8659, 8659, 8660, 8660,
2640                                                8660, 8660, 8661, 8661, 8661, 8662, 8663, 8664, 8665, 8666, 8666, 8667, 8667, 8669, 8676, 8676, 8677, 8677,
2641                                                8693, 8693, 8701, 8702, 8703, 8704, 8704, 8705, 8705, 8706, 8706, 8707, 8707, 8708, 8708, 8708, 8709, 8709,
2642                                                8709, 8709, 8711, 8711, 8712, 8712, 8712, 8712, 8713, 8713, 8713, 8715, 8715, 8715, 8715, 8716, 8716, 8716,
2643                                                8719, 8719, 8720, 8720, 8721, 8721, 8722, 8723, 8723, 8723, 8724, 8724, 8726, 8726, 8726, 8726, 8726, 8727,
2644                                                8728, 8728, 8730, 8730, 8733, 8733, 8733, 8733, 8733, 8734, 8735, 8736, 8736, 8737, 8737, 8738, 8739, 8739,
2645                                                8739, 8739, 8740, 8740, 8740, 8740, 8741, 8741, 8741, 8741, 8741, 8742, 8742, 8742, 8742, 8742, 8743, 8743,
2646                                                8744, 8744, 8745, 8746, 8747, 8747, 8748, 8749, 8749, 8750, 8750, 8750, 8751, 8751, 8752, 8753, 8754, 8754,
2647                                                8754, 8755, 8756, 8756, 8756, 8757, 8758, 8759, 8759, 8760, 8760, 8762, 8763, 8764, 8764, 8764, 8764, 8765,
2648                                                8765, 8766, 8766, 8767, 8768, 8768, 8768, 8769, 8769, 8770, 8770, 8770, 8771, 8771, 8771, 8772, 8772, 8772,
2649                                                8773, 8773, 8774, 8775, 8775, 8776, 8776, 8776, 8776, 8776, 8776, 8777, 8777, 8777, 8778, 8778, 8779, 8780,
2650                                                8780, 8781, 8781, 8782, 8782, 8782, 8783, 8783, 8783, 8784, 8784, 8784, 8785, 8785, 8786, 8786, 8787, 8787,
2651                                                8788, 8788, 8788, 8789, 8789, 8790, 8790, 8791, 8791, 8793, 8794, 8796, 8796, 8799, 8799, 8800, 8800, 8801,
2652                                                8801, 8802, 8802, 8804, 8804, 8805, 8805, 8805, 8806, 8806, 8806, 8807, 8807, 8807, 8808, 8808, 8809, 8809,
2653                                                8810, 8810, 8810, 8811, 8811, 8811, 8812, 8812, 8813, 8814, 8814, 8814, 8815, 8815, 8815, 8816, 8816, 8816,
2654                                                8817, 8817, 8817, 8818, 8818, 8818, 8819, 8819, 8819, 8820, 8820, 8821, 8821, 8822, 8822, 8822, 8823, 8823,
2655                                                8823, 8824, 8824, 8825, 8825, 8826, 8826, 8826, 8827, 8827, 8827, 8828, 8828, 8828, 8829, 8829, 8829, 8830,
2656                                                8830, 8830, 8831, 8831, 8831, 8832, 8832, 8832, 8833, 8833, 8833, 8834, 8834, 8835, 8835, 8835, 8836, 8837,
2657                                                8838, 8838, 8838, 8839, 8839, 8839, 8840, 8840, 8840, 8841, 8841, 8841, 8842, 8842, 8843, 8843, 8845, 8846,
2658                                                8846, 8847, 8847, 8847, 8848, 8848, 8848, 8849, 8849, 8849, 8850, 8850, 8850, 8851, 8851, 8852, 8852, 8853,
2659                                                8853, 8854, 8854, 8855, 8855, 8856, 8857, 8857, 8858, 8858, 8859, 8859, 8861, 8861, 8862, 8862, 8863, 8863,
2660                                                8864, 8864, 8865, 8865, 8866, 8866, 8867, 8867, 8868, 8868, 8869, 8869, 8869, 8869, 8871, 8872, 8872, 8873,
2661                                                8874, 8875, 8876, 8877, 8878, 8879, 8880, 8882, 8882, 8882, 8883, 8883, 8883, 8884, 8884, 8884, 8885, 8885,
2662                                                8885, 8886, 8887, 8888, 8888, 8889, 8890, 8890, 8891, 8893, 8894, 8895, 8896, 8896, 8896, 8897, 8897, 8897,
2663                                                8898, 8898, 8898, 8899, 8899, 8899, 8900, 8900, 8900, 8901, 8902, 8902, 8903, 8903, 8904, 8905, 8906, 8907,
2664                                                8907, 8908, 8908, 8909, 8909, 8910, 8910, 8911, 8911, 8912, 8912, 8913, 8913, 8914, 8915, 8916, 8916, 8917,
2665                                                8918, 8918, 8919, 8919, 8920, 8921, 8921, 8922, 8922, 8922, 8923, 8923, 8923, 8926, 8926, 8927, 8927, 8928,
2666                                                8928, 8929, 8929, 8930, 8930, 8931, 8931, 8934, 8935, 8936, 8936, 8937, 8937, 8938, 8938, 8938, 8939, 8939,
2667                                                8939, 8940, 8940, 8940, 8941, 8941, 8941, 8942, 8943, 8944, 8945, 8946, 8947, 8948, 8949, 8950, 8951, 8953,
2668                                                8954, 8955, 8956, 8957, 8958, 8965, 8966, 8968, 8968, 8969, 8969, 8970, 8970, 8971, 8971, 8972, 8973, 8974,
2669                                                8975, 8976, 8978, 8979, 8981, 8982, 8988, 8988, 8989, 8989, 8990, 8990, 8991, 8991, 8994, 8994, 8995, 8995,
2670                                                9005, 9006, 9014, 9021, 9023, 9084, 9136, 9136, 9137, 9137, 9140, 9140, 9141, 9141, 9142, 9180, 9181, 9182,
2671                                                9183, 9186, 9191, 9251, 9416, 9416, 9472, 9472, 9474, 9484, 9488, 9492, 9496, 9500, 9508, 9516, 9524, 9532,
2672                                                9552, 9553, 9554, 9555, 9556, 9557, 9558, 9559, 9560, 9561, 9562, 9563, 9564, 9565, 9566, 9567, 9568, 9569,
2673                                                9570, 9571, 9572, 9573, 9574, 9575, 9576, 9577, 9578, 9579, 9580, 9600, 9604, 9608, 9617, 9618, 9619, 9633,
2674                                                9633, 9633, 9642, 9642, 9642, 9642, 9643, 9645, 9646, 9649, 9651, 9651, 9652, 9652, 9653, 9653, 9656, 9656,
2675                                                9657, 9657, 9661, 9661, 9662, 9662, 9663, 9663, 9666, 9666, 9667, 9667, 9674, 9674, 9675, 9708, 9711, 9711,
2676                                                9720, 9721, 9722, 9723, 9724, 9733, 9733, 9734, 9742, 9792, 9794, 9824, 9824, 9827, 9827, 9829, 9829, 9830,
2677                                                9830, 9834, 9837, 9838, 9838, 9839, 10003, 10003, 10007, 10016, 10016, 10038, 10072, 10098, 10099, 10184,
2678                                                10185, 10214, 10214, 10215, 10215, 10216, 10216, 10216, 10217, 10217, 10217, 10218, 10219, 10220, 10221,
2679                                                10229, 10229, 10229, 10230, 10230, 10230, 10231, 10231, 10231, 10232, 10232, 10232, 10233, 10233, 10233,
2680                                                10234, 10234, 10234, 10236, 10236, 10239, 10498, 10499, 10500, 10501, 10508, 10509, 10509, 10510, 10511,
2681                                                10511, 10512, 10512, 10513, 10514, 10515, 10518, 10521, 10522, 10523, 10524, 10525, 10526, 10527, 10528,
2682                                                10531, 10532, 10533, 10533, 10534, 10534, 10535, 10536, 10536, 10537, 10537, 10538, 10547, 10549, 10550,
2683                                                10551, 10552, 10553, 10556, 10557, 10565, 10568, 10569, 10570, 10571, 10574, 10575, 10576, 10577, 10578,
2684                                                10579, 10580, 10581, 10582, 10583, 10584, 10585, 10586, 10587, 10588, 10589, 10590, 10591, 10592, 10593,
2685                                                10594, 10595, 10596, 10597, 10598, 10599, 10600, 10601, 10602, 10603, 10604, 10605, 10606, 10606, 10607,
2686                                                10607, 10608, 10609, 10610, 10611, 10612, 10613, 10614, 10616, 10617, 10619, 10620, 10621, 10622, 10623,
2687                                                10629, 10630, 10635, 10636, 10637, 10638, 10639, 10640, 10641, 10642, 10643, 10644, 10645, 10646, 10650,
2688                                                10652, 10653, 10660, 10661, 10662, 10663, 10664, 10665, 10666, 10667, 10668, 10669, 10670, 10671, 10672,
2689                                                10673, 10674, 10675, 10676, 10677, 10678, 10679, 10681, 10683, 10684, 10686, 10687, 10688, 10689, 10690,
2690                                                10691, 10692, 10693, 10697, 10701, 10702, 10703, 10704, 10716, 10717, 10718, 10723, 10724, 10725, 10731,
2691                                                10731, 10740, 10742, 10752, 10752, 10753, 10753, 10754, 10754, 10756, 10756, 10758, 10758, 10764, 10764,
2692                                                10765, 10768, 10769, 10770, 10771, 10772, 10773, 10774, 10775, 10786, 10787, 10788, 10789, 10790, 10791,
2693                                                10793, 10794, 10797, 10798, 10799, 10800, 10801, 10803, 10804, 10805, 10806, 10807, 10808, 10809, 10810,
2694                                                10811, 10812, 10812, 10815, 10816, 10818, 10819, 10820, 10821, 10822, 10823, 10824, 10825, 10826, 10827,
2695                                                10828, 10829, 10832, 10835, 10836, 10837, 10838, 10839, 10840, 10842, 10843, 10844, 10845, 10847, 10854,
2696                                                10858, 10861, 10862, 10863, 10864, 10865, 10866, 10867, 10868, 10869, 10871, 10871, 10872, 10873, 10874,
2697                                                10875, 10876, 10877, 10877, 10877, 10878, 10878, 10878, 10879, 10880, 10881, 10882, 10883, 10884, 10885,
2698                                                10885, 10886, 10886, 10887, 10887, 10888, 10888, 10889, 10889, 10890, 10890, 10891, 10891, 10892, 10892,
2699                                                10893, 10894, 10895, 10896, 10897, 10898, 10899, 10900, 10901, 10901, 10902, 10902, 10903, 10904, 10905,
2700                                                10906, 10909, 10910, 10911, 10912, 10913, 10914, 10916, 10917, 10918, 10919, 10920, 10921, 10922, 10923,
2701                                                10924, 10925, 10926, 10927, 10927, 10927, 10928, 10928, 10928, 10931, 10932, 10933, 10933, 10934, 10934,
2702                                                10935, 10935, 10936, 10936, 10937, 10937, 10938, 10938, 10939, 10940, 10941, 10942, 10943, 10944, 10945,
2703                                                10946, 10947, 10948, 10949, 10949, 10950, 10950, 10951, 10952, 10955, 10955, 10956, 10956, 10959, 10960,
2704                                                10961, 10962, 10963, 10964, 10965, 10966, 10967, 10968, 10969, 10970, 10971, 10980, 10980, 10982, 10983,
2705                                                10984, 10985, 10987, 10988, 10989, 10990, 10991, 10992, 10993, 10994, 10995, 11005, 64256, 64257, 64258,
2706                                                64259, 64260, 119964, 119966, 119967, 119970, 119973, 119974, 119977, 119978, 119979, 119980, 119982,
2707                                                119983, 119984, 119985, 119986, 119987, 119988, 119989, 119990, 119991, 119992, 119993, 119995, 119997,
2708                                                119998, 119999, 120000, 120001, 120002, 120003, 120005, 120006, 120007, 120008, 120009, 120010, 120011,
2709                                                120012, 120013, 120014, 120015, 120068, 120069, 120071, 120072, 120073, 120074, 120077, 120078, 120079,
2710                                                120080, 120081, 120082, 120083, 120084, 120086, 120087, 120088, 120089, 120090, 120091, 120092, 120094,
2711                                                120095, 120096, 120097, 120098, 120099, 120100, 120101, 120102, 120103, 120104, 120105, 120106, 120107,
2712                                                120108, 120109, 120110, 120111, 120112, 120113, 120114, 120115, 120116, 120117, 120118, 120119, 120120,
2713                                                120121, 120123, 120124, 120125, 120126, 120128, 120129, 120130, 120131, 120132, 120134, 120138, 120139,
2714                                                120140, 120141, 120142, 120143, 120144, 120146, 120147, 120148, 120149, 120150, 120151, 120152, 120153,
2715                                                120154, 120155, 120156, 120157, 120158, 120159, 120160, 120161, 120162, 120163, 120164, 120165, 120166,
2716                                                120167, 120168, 120169, 120170, 120171};
                /**
                 * Expansions for the entity names that map to two code points instead of one.
                 * <p>
                 * Each element packs both code points into a single {@code long}: the first code point is
                 * {@code value >> 20} and the second is the low 20 bits ({@code value & 0xFFFFF}). For example
                 * {@code 0x003C020D2L} unpacks to U+003C followed by U+20D2.
                 * <p>
                 * The static initializer pairs element {@code i} with {@code NAMES[CODEPOINTS.length + i]}, so the
                 * order of this array must stay aligned with the tail of {@code NAMES}.
                 */
                private static final long[] COMBINED_DIACRITICALS =
                                {0x003C020D2L, 0x003D020E5L, 0x003E020D2L, 0x00660006AL, 0x205F0200AL, 0x219D00338L, 0x220200338L,
                                                0x2220020D2L, 0x22290FE00L, 0x222A0FE00L, 0x223C020D2L, 0x223D00331L, 0x223E00333L, 0x224200338L,
                                                0x224200338L, 0x224B00338L, 0x224D020D2L, 0x224E00338L, 0x224E00338L, 0x224F00338L, 0x224F00338L,
                                                0x225000338L, 0x2261020E5L, 0x2264020D2L, 0x2265020D2L, 0x226600338L, 0x226600338L, 0x226700338L,
                                                0x226700338L, 0x226700338L, 0x22680FE00L, 0x22680FE00L, 0x22690FE00L, 0x22690FE00L, 0x226A00338L,
                                                0x226A00338L, 0x226A020D2L, 0x226B00338L, 0x226B00338L, 0x226B020D2L, 0x227F00338L, 0x2282020D2L,
                                                0x2282020D2L, 0x2282020D2L, 0x2283020D2L, 0x2283020D2L, 0x2283020D2L, 0x228A0FE00L, 0x228A0FE00L,
                                                0x228B0FE00L, 0x228B0FE00L, 0x228F00338L, 0x229000338L, 0x22930FE00L, 0x22940FE00L, 0x22B4020D2L,
                                                0x22B5020D2L, 0x22D800338L, 0x22D900338L, 0x22DA0FE00L, 0x22DB0FE00L, 0x22F500338L, 0x22F900338L,
                                                0x293300338L, 0x29CF00338L, 0x29D000338L, 0x2A6D00338L, 0x2A7000338L, 0x2A7D00338L, 0x2A7D00338L,
                                                0x2A7D00338L, 0x2A7E00338L, 0x2A7E00338L, 0x2A7E00338L, 0x2AA100338L, 0x2AA200338L, 0x2AAC0FE00L,
                                                0x2AAD0FE00L, 0x2AAF00338L, 0x2AAF00338L, 0x2AAF00338L, 0x2AB000338L, 0x2AB000338L, 0x2AB000338L,
                                                0x2AC500338L, 0x2AC500338L, 0x2AC600338L, 0x2AC600338L, 0x2ACB0FE00L, 0x2ACB0FE00L, 0x2ACC0FE00L,
                                                0x2ACC0FE00L, 0x2AFD020E5L};
2732
2733                private static final int MIN_ESCAPE;
2734                private static final int MAX_ESCAPE;
2735                private static final HashMap<String, int[]> LOOKUP_MAP;
2736
2737                static {
2738                        int minEscape = Integer.MAX_VALUE;
2739                        int maxEscape = Integer.MIN_VALUE;
2740                        HashMap<String, int[]> lookupMap = new HashMap<>(NAMES.length);
2741
2742                        for (String name : NAMES) {
2743                                minEscape = Math.min(minEscape, name.length());
2744                                maxEscape = Math.max(maxEscape, name.length());
2745                        }
2746
2747                        for (int i = 0; i < CODEPOINTS.length; i++)
2748                                lookupMap.put(NAMES[i], new int[]{CODEPOINTS[i]});
2749
2750                        for (int i = 0; i < COMBINED_DIACRITICALS.length; i++) {
2751                                long combinedDiacritical = COMBINED_DIACRITICALS[i];
2752                                int codepoint1 = (int) (combinedDiacritical >> 20);
2753                                int codepoint2 = (int) (combinedDiacritical & 0xFFFFF);
2754                                lookupMap.put(NAMES[CODEPOINTS.length + i], new int[]{codepoint1, codepoint2});
2755                        }
2756
2757                        MIN_ESCAPE = minEscape;
2758                        MAX_ESCAPE = maxEscape;
2759                        LOOKUP_MAP = lookupMap;
2760                }
2761
2762                public static String unescapeHtml(String input) {
2763                        StringBuilder result = null;
2764
2765                        int len = input.length();
2766                        int start = 0;
2767                        int escStart = 0;
2768                        while (true) {
2769                                // Look for '&'
2770                                while (escStart < len && input.charAt(escStart) != '&')
2771                                        escStart++;
2772
2773                                if (escStart == len)
2774                                        break;
2775
2776                                escStart++;
2777
2778                                // Found '&'. Look for ';'
2779                                int escEnd = escStart;
2780                                while (escEnd < len && escEnd - escStart < MAX_ESCAPE + 1 && input.charAt(escEnd) != ';')
2781                                        escEnd++;
2782
2783                                if (escEnd == len)
2784                                        break;
2785
2786                                // Bail if this is not a potential HTML entity.
2787                                if (escEnd - escStart < MIN_ESCAPE || escEnd - escStart == MAX_ESCAPE + 1) {
2788                                        escStart++;
2789                                        continue;
2790                                }
2791
2792                                // Check the kind of entity
2793                                if (input.charAt(escStart) == '#') {
2794                                        // Numeric entity
2795                                        int numStart = escStart + 1;
2796                                        int radix;
2797
2798                                        char firstChar = input.charAt(numStart);
2799                                        if (firstChar == 'x' || firstChar == 'X') {
2800                                                numStart++;
2801                                                radix = 16;
2802                                        } else {
2803                                                radix = 10;
2804                                        }
2805
2806                                        try {
2807                                                int entityValue = Integer.parseInt(input.substring(numStart, escEnd), radix);
2808
2809                                                if (result == null)
2810                                                        result = new StringBuilder(input.length());
2811
2812                                                result.append(input, start, escStart - 1);
2813
2814                                                if (entityValue > 0xFFFF)
2815                                                        result.append(Character.toChars(entityValue));
2816                                                else
2817                                                        result.append((char) entityValue);
2818                                        } catch (NumberFormatException ignored) {
2819                                                escStart++;
2820                                                continue;
2821                                        }
2822                                } else {
2823                                        // Named entity
2824                                        int[] codePoints = LOOKUP_MAP.get(input.substring(escStart, escEnd));
2825                                        if (codePoints == null) {
2826                                                escStart++;
2827                                                continue;
2828                                        }
2829
2830                                        if (result == null)
2831                                                result = new StringBuilder(input.length());
2832
2833                                        result.append(input, start, escStart - 1);
2834                                        for (int codePoint : codePoints)
2835                                                result.appendCodePoint(codePoint);
2836                                }
2837
2838                                // Skip escape
2839                                start = escEnd + 1;
2840                                escStart = start;
2841                        }
2842
2843                        if (result != null) {
2844                                result.append(input, start, len);
2845                                return result.toString();
2846                        }
2847
2848                        return input;
2849                }
2850        }
2851
2852        // *** END HTML-Unescaper source ***
2853}