From ae9e157ef4e6a3708489725d4436cc15d273308f Mon Sep 17 00:00:00 2001 From: James Moger <james.moger@gitblit.com> Date: Thu, 07 Jun 2012 17:30:18 -0400 Subject: [PATCH] Try multiple encodings when working with string blobs (issue 97) --- src/com/gitblit/utils/StringUtils.java | 42 ++++++++++++++++++++++++++++++++++++++++++ 1 files changed, 42 insertions(+), 0 deletions(-) diff --git a/src/com/gitblit/utils/StringUtils.java b/src/com/gitblit/utils/StringUtils.java index 2c35724..baed5f0 100644 --- a/src/com/gitblit/utils/StringUtils.java +++ b/src/com/gitblit/utils/StringUtils.java @@ -16,13 +16,23 @@ package com.gitblit.utils; import java.io.UnsupportedEncodingException; +import java.nio.ByteBuffer; +import java.nio.CharBuffer; +import java.nio.charset.CharacterCodingException; +import java.nio.charset.Charset; +import java.nio.charset.CharsetDecoder; +import java.nio.charset.IllegalCharsetNameException; +import java.nio.charset.UnsupportedCharsetException; import java.security.MessageDigest; import java.security.NoSuchAlgorithmException; import java.util.ArrayList; +import java.util.Arrays; import java.util.Collection; import java.util.Collections; import java.util.Comparator; +import java.util.LinkedHashSet; import java.util.List; +import java.util.Set; import java.util.regex.PatternSyntaxException; /** @@ -550,4 +560,36 @@ // remember to append any characters to the right of a match return sb.toString(); } + + /** + * Decodes a string by trying several charsets until one does not throw a + * coding exception. Last resort is to interpret as UTF-8 with illegal + * character substitution. + * + * @param content + * @param charsets optional + * @return a string + */ + public static String decodeString(byte [] content, String... charsets) { + Set<String> sets = new LinkedHashSet<String>(); + if (!ArrayUtils.isEmpty(charsets)) { + sets.addAll(Arrays.asList(charsets)); + } + sets.addAll(Arrays.asList("UTF-8", "ISO-8859-1", Charset.defaultCharset().name())); + for (String charset : sets) { + try { + Charset cs = Charset.forName(charset); + CharsetDecoder decoder = cs.newDecoder(); + CharBuffer buffer = decoder.decode(ByteBuffer.wrap(content)); + return buffer.toString(); + } catch (CharacterCodingException e) { + // ignore and advance to the next charset + } catch (IllegalCharsetNameException e) { + // ignore illegal charset names + } catch (UnsupportedCharsetException e) { + // ignore unsupported charsets + } + } + return new String(content, Charset.forName("UTF-8")); + } } \ No newline at end of file -- Gitblit v1.9.1