在Tomcat中自动检测URI编码

一尘不染

在Tomcat中自动检测URI编码

tomcat

我有一个正在运行的Apache Tomcat 6.x实例，并且我希望它能比默认行为更智能地解释传入URL的字符集。特别是，我想实现以下映射：

So%DFe => Soße
So%C3%9Fe => Soße
So%DF%C3%9F => (error)

我想要的行为可以描述为“尝试将字节流解码为UTF-8，如果不起作用，则假定为ISO-8859-1”。

URIEncoding在这种情况下，仅使用配置无效。那么，如何配置Tomcat以所需的方式对请求进行编码？

我可能必须编写一个接受请求（特别是查询字符串）并将其重新编码为参数的过滤器。那是自然的方法吗？

阅读 219

2020-06-16

共1个答案

一尘不染

实现我的目标的复杂方法确实是编写我自己的javax.servlet.Filter并将其嵌入过滤器链中。该解决方案符合Tomcat Wiki-
字符编码问题中提供的Apache
Tomcat建议。

更新（2010-07-31）：
此过滤器的第一个版本解释了查询字符串本身，这是一个坏主意。POST与其他servlet过滤器（如URL重写）结合使用时，它不能正确处理请求，并且出现问题。该版本将包装最初提供的参数并重新编码。为了使其正常工作，URIEncoding（例如在Tomcat中）必须将其配置为ISO-8859-1。

package de.roland_illig.webapps.webapp1;

import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CodingErrorAction;
import java.nio.charset.IllegalCharsetNameException;
import java.nio.charset.UnsupportedCharsetException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Enumeration;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Pattern;

import javax.servlet.Filter;
import javax.servlet.FilterChain;
import javax.servlet.FilterConfig;
import javax.servlet.ServletException;
import javax.servlet.ServletRequest;
import javax.servlet.ServletResponse;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletRequestWrapper;
import javax.servlet.http.HttpServletResponse;

/**
 * Automatically determines the encoding of the request parameters. It assumes
 * that the parameters of the original request are encoded by a 1:1 mapping from
 * bytes to characters.
 * <p>
 * If the request parameters cannot be decoded by any of the given encodings,
 * the filter chain is not processed further, but a status code of 400 with a
 * helpful error message is returned instead.
 * <p>
 * The filter can be configured using the following parameters:
 * <ul>
 * <li>{@code encodings}: The comma-separated list of encodings (see
 * {@link Charset#forName(String)}) that are tried in order. The first one that
 * can decode the complete query string is taken.
 * <p>
 * Default value: {@code UTF-8}
 * <p>
 * Example: {@code UTF-8,EUC-KR,ISO-8859-15}.
 * <li>{@code inputEncodingParameterName}: When this parameter is defined and a
 * query parameter of that name is provided by the client, and that parameter's
 * value contains only non-escaped characters and the server knows an encoding
 * of that name, then it is used exclusively, overriding the {@code encodings}
 * parameter for this request.
 * <p>
 * Default value: {@code null}
 * <p>
 * Example: {@code ie} (as used by Google).
 * </ul>
 */
public class EncodingFilter implements Filter {

  private static final Pattern PAT_COMMA = Pattern.compile(",\\s*");

  private String inputEncodingParameterName = null;
  private final List<Charset> encodings = new ArrayList<Charset>();

  @Override
  @SuppressWarnings("unchecked")
  public void init(FilterConfig config) throws ServletException {
    String encodingsStr = "UTF-8";

    Enumeration<String> en = config.getInitParameterNames();
    while (en.hasMoreElements()) {
      final String name = en.nextElement();
      final String value = config.getInitParameter(name);
      if (name.equals("encodings")) {
        encodingsStr = value;
      } else if (name.equals("inputEncodingParameterName")) {
        inputEncodingParameterName = value;
      } else {
        throw new IllegalArgumentException("Unknown parameter: " + name);
      }
    }

    for (String encoding : PAT_COMMA.split(encodingsStr)) {
      Charset charset = Charset.forName(encoding);
      encodings.add(charset);
    }
  }

  @SuppressWarnings("unchecked")
  @Override
  public void doFilter(ServletRequest sreq, ServletResponse sres, FilterChain fc) throws IOException, ServletException {
    final HttpServletRequest req = (HttpServletRequest) sreq;
    final HttpServletResponse res = (HttpServletResponse) sres;

    final Map<String, String[]> params;
    try {
      params = Util.decodeParameters(req.getParameterMap(), encodings, inputEncodingParameterName);
    } catch (IOException e) {
      res.sendError(400, e.getMessage());
      return;
    }

    HttpServletRequest wrapper = new ParametersWrapper(req, params);
    fc.doFilter(wrapper, res);
  }

  @Override
  public void destroy() {
    // nothing to do
  }

  static abstract class Util {

    static CharsetDecoder strictDecoder(Charset cs) {
      CharsetDecoder dec = cs.newDecoder();
      dec.onMalformedInput(CodingErrorAction.REPORT);
      dec.onUnmappableCharacter(CodingErrorAction.REPORT);
      return dec;
    }

    static int[] toCodePoints(String str) {
      final int len = str.length();
      int[] codePoints = new int[len];
      int i = 0, j = 0;
      while (i < len) {
        int cp = Character.codePointAt(str, i);
        codePoints[j++] = cp;
        i += Character.charCount(cp);
      }
      return j == len ? codePoints : Arrays.copyOf(codePoints, len);
    }

    public static String recode(String encoded, CharsetDecoder decoder) throws IOException {
      byte[] bytes = new byte[encoded.length()];
      int bytescount = 0;

      for (int i = 0; i < encoded.length(); i++) {
        char c = encoded.charAt(i);
        if (!(c <= '\u00FF'))
          throw new IOException("Invalid character: #" + (int) c);
        bytes[bytescount++] = (byte) c;
      }

      CharBuffer cbuf = decoder.decode(ByteBuffer.wrap(bytes, 0, bytescount));
      String result = cbuf.toString();
      return result;
    }

    static String ensureDefinedUnicode(String s) throws IOException {
      for (int cp : toCodePoints(s)) {
        if (!Character.isDefined(cp))
          throw new IOException("Undefined unicode code point: " + cp);
      }
      return s;
    }

    static Map<String, String[]> decodeParameters(Map<String, String[]> originalParams, List<Charset> charsets, String ieName) throws IOException {
      Map<String, String[]> params = new LinkedHashMap<String, String[]>();

      Charset ie = null;
      {
        String[] values = originalParams.get(ieName);
        if (values != null) {
          for (String value : values) {
            if (!value.isEmpty() && value.indexOf('%') == -1) {
              try {
                if (ie != null)
                  throw new IOException("Duplicate value for input encoding parameter: " + ie + " and " + value + ".");
                ie = Charset.forName(value);
              } catch (IllegalCharsetNameException e) {
                throw new IOException("Illegal input encoding name: " + value);
              } catch (UnsupportedCharsetException e) {
                throw new IOException("Unsupported input encoding: " + value);
              }
            }
          }
        }
      }

      Charset[] css = (ie != null) ? new Charset[] { ie } : charsets.toArray(new Charset[charsets.size()]);
      for (Charset charset : css) {
        try {
          params.clear();
          CharsetDecoder decoder = strictDecoder(charset);
          for (Map.Entry<String, String[]> entry : originalParams.entrySet()) {
            final String encodedName = entry.getKey();
            final String name = ensureDefinedUnicode(Util.recode(encodedName, decoder));
            for (final String encodedValue : entry.getValue()) {
              final String value = ensureDefinedUnicode(Util.recode(encodedValue, decoder));
              String[] oldValues = params.get(name);
              String[] newValues = (oldValues == null) ? new String[1] : Arrays.copyOf(oldValues, oldValues.length + 1);
              newValues[newValues.length - 1] = value;
              params.put(name, newValues);
            }
          }
          return params;
        } catch (IOException e) {
          continue;
        }
      }

      List<String> kvs = new ArrayList<String>();
      for (Map.Entry<String, String[]> entry : originalParams.entrySet()) {
        final String key = entry.getKey();
        for (final String value : entry.getValue()) {
          kvs.add(key + "=" + value);
        }
      }
      throw new IOException("Could not decode the parameters: " + kvs.toString());
    }
  }

  @SuppressWarnings("unchecked")
  static class ParametersWrapper extends HttpServletRequestWrapper {

    private final Map<String, String[]> params;

    public ParametersWrapper(HttpServletRequest request, Map<String, String[]> params) {
      super(request);
      this.params = params;
    }

    @Override
    public String getParameter(String name) {
      String[] values = params.get(name);
      return (values != null && values.length != 0) ? values[0] : null;
    }

    @Override
    public Map getParameterMap() {
      return Collections.unmodifiableMap(params);
    }

    @Override
    public Enumeration getParameterNames() {
      return Collections.enumeration(params.keySet());
    }

    @Override
    public String[] getParameterValues(String name) {
      return params.get(name);
    }
  }
}

尽管代码的大小相当小，但是有些实现细节可能会出错，因此我希望Tomcat已经提供了类似的过滤器。

要激活此过滤器，我将以下内容添加到了我的web.xml：

<filter>
  <filter-name>EncodingFilter</filter-name>
  <filter-class>de.roland_illig.webapps.webapp1.EncodingFilter</filter-class>
  <init-param>
    <param-name>encodings</param-name>
    <param-value>US-ASCII, UTF-8, EUC-KR, ISO-8859-15, ISO-8859-1</param-value>
  </init-param>
  <init-param>
    <param-name>inputEncodingParameterName</param-name>
    <param-value>ie</param-value>
  </init-param>
</filter>

<filter-mapping>
  <filter-name>EncodingFilter</filter-name>
  <url-pattern>/*</url-pattern>
</filter-mapping>

2020-06-16