Reject invalid UTF-8 instead of ignoring it.

2026-02-20 22:59:54 +01:00 · 2023-02-18 17:10:57 +00:00
parent 10b1fe756d
commit 5a09530670
3 changed files with 52 additions and 28 deletions
--- a/bramble-api/src/main/java/org/briarproject/bramble/util/StringUtils.java
+++ b/bramble-api/src/main/java/org/briarproject/bramble/util/StringUtils.java
@@ -14,6 +14,7 @@ import java.util.regex.Pattern;
 import javax.annotation.Nullable;
 import static java.nio.charset.CodingErrorAction.IGNORE;
+import static java.nio.charset.CodingErrorAction.REPORT;
 import static java.util.regex.Pattern.CASE_INSENSITIVE;
 @SuppressWarnings("CharsetObjectCanBeUsed")
@@ -52,26 +53,38 @@ public class StringUtils {
 		return s.getBytes(UTF_8);
 	}
-	public static String fromUtf8(byte[] bytes) {
+	public static String fromUtf8(byte[] bytes) throws FormatException {
-		return fromUtf8(bytes, 0, bytes.length);
+		return fromUtf8(bytes, 0, bytes.length, true);
 	}
-	public static String fromUtf8(byte[] bytes, int off, int len) {
+	public static String fromUtf8(byte[] bytes, int off, int len)
+			throws FormatException {
+		return fromUtf8(bytes, off, len, true);
+	}
+	private static String fromUtf8(byte[] bytes, int off, int len,
+			boolean strict) throws FormatException {
 		CharsetDecoder decoder = UTF_8.newDecoder();
-		decoder.onMalformedInput(IGNORE);
+		decoder.onMalformedInput(strict ? REPORT : IGNORE);
-		decoder.onUnmappableCharacter(IGNORE);
+		decoder.onUnmappableCharacter(strict ? REPORT : IGNORE);
 		ByteBuffer buffer = ByteBuffer.wrap(bytes, off, len);
 		try {
 			return decoder.decode(buffer).toString();
 		} catch (CharacterCodingException e) {
-			throw new AssertionError(e);
+			throw new FormatException();
 		}
 	}
 	public static String truncateUtf8(String s, int maxUtf8Length) {
 		byte[] utf8 = toUtf8(s);
 		if (utf8.length <= maxUtf8Length) return s;
-		return fromUtf8(utf8, 0, maxUtf8Length);
+		// Don't be strict when converting back, so that if we truncate a
+		// multi-byte character the whole character gets dropped
+		try {
+			return fromUtf8(utf8, 0, maxUtf8Length, false);
+		} catch (FormatException e) {
+			throw new AssertionError(e);
+		}
 	}
 	/**
--- a/bramble-core/src/test/java/org/briarproject/bramble/data/BdfReaderImplFuzzingTest.java
+++ b/bramble-core/src/test/java/org/briarproject/bramble/data/BdfReaderImplFuzzingTest.java
@@ -1,5 +1,6 @@
 package org.briarproject.bramble.data;
+import org.briarproject.bramble.api.FormatException;
 import org.briarproject.bramble.test.BrambleTestCase;
 import org.junit.Before;
 import org.junit.Test;
@@ -32,10 +33,13 @@ public class BdfReaderImplFuzzingTest extends BrambleTestCase {
 			in.reset();
 			BdfReaderImpl r = new BdfReaderImpl(in, DEFAULT_NESTED_LIMIT,
 					DEFAULT_MAX_BUFFER_SIZE);
-			int length = r.readString().length();
+			try {
-			assertTrue(length >= 0);
+				int length = r.readString().length();
-			assertTrue(length <= 20);
+				assertTrue(length <= 20);
-			assertTrue(r.eof());
+				assertTrue(r.eof());
+			} catch (FormatException e) {
+				// Expected when bytes are not valid UTF-8
+			}
 		}
 	}
 }
--- a/bramble-core/src/test/java/org/briarproject/bramble/util/StringUtilsTest.java
+++ b/bramble-core/src/test/java/org/briarproject/bramble/util/StringUtilsTest.java
@@ -88,51 +88,58 @@ public class StringUtilsTest extends BrambleTestCase {
 	}
 	@Test
-	public void testFromUtf8AcceptsNullCharacterUsingStandardUtf8() {
+	public void testFromUtf8AcceptsNullCharacterUsingStandardUtf8()
+			throws Exception {
 		// The UTF-8 encoding of the null character is valid
-		assertEquals("\u0000", StringUtils.fromUtf8(new byte[1]));
+		byte[] utf8 = new byte[1];
+		String actual = StringUtils.fromUtf8(utf8);
+		assertEquals("\u0000", actual);
+		// When we convert back to UTF-8 we should get the original encoding
+		assertArrayEquals(utf8, StringUtils.toUtf8(actual));
 	}
-	@Test
+	@Test(expected = FormatException.class)
-	public void testFromUtf8RemovesNullCharacterUsingModifiedUtf8() {
+	public void testFromUtf8RejectsNullCharacterUsingModifiedUtf8()
+			throws Exception {
 		// The modified UTF-8 encoding of the null character is not valid
 		byte[] b = new byte[] {
 				(byte) 0xC0, (byte) 0x80, // Null character as modified UTF-8
 				(byte) 0xC8, (byte) 0x85 // U+0205
 		};
-		// Conversion should ignore the invalid character and return the rest
+		StringUtils.fromUtf8(b);
-		String expected = "\u0205";
-		assertEquals(expected, StringUtils.fromUtf8(b));
 	}
 	@Test
-	public void testFromUtf8AcceptsSupplementaryCharacterUsingStandardUtf8() {
+	public void testFromUtf8AcceptsSupplementaryCharacterUsingStandardUtf8()
+			throws Exception {
 		// The UTF-8 encoding of a supplementary character is valid and should
 		// be converted to a surrogate pair
-		byte[] b = new byte[] {
+		byte[] utf8 = new byte[] {
 				(byte) 0xF0, (byte) 0x90, (byte) 0x90, (byte) 0x80, // U+10400
 				(byte) 0xC8, (byte) 0x85 // U+0205
 		};
 		String expected = "\uD801\uDC00\u0205"; // Surrogate pair
-		assertEquals(expected, StringUtils.fromUtf8(b));
+		String actual = StringUtils.fromUtf8(utf8);
+		assertEquals(expected, actual);
+		// When we convert back to UTF-8 we should get the original encoding
+		assertArrayEquals(utf8, StringUtils.toUtf8(actual));
 	}
-	@Test
+	@Test(expected = FormatException.class)
-	public void testFromUtf8RemovesSupplementaryCharacterUsingModifiedUtf8() {
+	public void testFromUtf8RejectsSupplementaryCharacterUsingModifiedUtf8()
+			throws Exception {
 		// The CESU-8 or modified UTF-8 encoding of a supplementary character
 		// is not valid
-		byte[] b = new byte[] {
+		byte[] utf8 = new byte[] {
 				(byte) 0xED, (byte) 0xA0, (byte) 0x81, // U+10400 as CESU-8
 				(byte) 0xED, (byte) 0xB0, (byte) 0x80,
 				(byte) 0xC8, (byte) 0x85 // U+0205
 		};
-		// Conversion should ignore the invalid character and return the rest
+		StringUtils.fromUtf8(utf8);
-		String expected = "\u0205";
-		assertEquals(expected, StringUtils.fromUtf8(b));
 	}
 	@Test
-	public void testFromUtf8EmptyInput() {
+	public void testFromUtf8EmptyInput() throws Exception {
 		assertEquals("", StringUtils.fromUtf8(new byte[0]));
 	}