以下面的代码为例:
File f = new File("./utf.txt"); FileInputStream in = new FileInputStream(f); // 指定读取文件时以UTF-8的格式读取 BufferedReader br = new BufferedReader(new InputStreamReader(in, "UTF-8")); String line = br.readLine(); while(line != null) { System.out.println(line); line = br.readLine(); }
实际读取时,可能发现第一个字符前会多一个问号。
原因就是BOM的3个字节被当做字符读取了,应该跳过才对。 JDK的类,没有自动跳过BOM,认为应该编程人员根据需要自己实际处理。
所以,我们自己动手,在 FileInputStream in = new FileInputStream(f); 之后,加一句 in.skip(3); 就行了。 注意:这样做的前提是确定文件开头带有BOM;如果文件本身没有BOM,skip(3)会把正文的前3个字节丢掉。
UTF-16LE或UTF-16BE格式时,BOM是2个字节,跳过两个字节即可。 UTF-32LE或UTF-32BE时,BOM是4个字节,需要跳过4个字节。
==========================================================
还有一个办法,是使用能够自动识别并跳过BOM的UnicodeReader: http://koti.mbnet.fi/akini/java/unicodereader/UnicodeReader.java.txt
替换一行 BufferedReader br = new BufferedReader(new UnicodeReader(in, Charset.defaultCharset().name()));
代码贴过来:
/** version: 1.1 / 2007-01-25 - changed BOM recognition ordering (longer boms first)
Original pseudocode : Thomas Weidenfeller Implementation tweaked: Aki Nieminen
http://www.unicode.org/unicode/faq/utf_bom.html
BOMs:
  00 00 FE FF    = UTF-32, big-endian
  FF FE 00 00    = UTF-32, little-endian
  EF BB BF       = UTF-8
  FE FF          = UTF-16, big-endian
  FF FE          = UTF-16, little-endian
Win2k Notepad: Unicode format = UTF-16LE ***/
import java.io.*;
/**
 * Generic Unicode text reader which uses the byte order mark (BOM) at the
 * start of the stream to identify the encoding to be used. If no BOM is
 * found, the given default encoding (or the system encoding) is used.
 *
 * <p>The BOM itself is consumed and never returned to the caller.
 */
public class UnicodeReader extends Reader {
    /** Longest BOM we look for (UTF-32), and therefore the pushback capacity. */
    private static final int BOM_SIZE = 4;

    /** Raw byte stream; pushback lets us return non-BOM bytes after peeking. */
    PushbackInputStream internalIn;

    /** Decoding reader, created lazily by {@link #init()}. */
    InputStreamReader internalIn2 = null;

    /** Encoding to use when the stream has no BOM; may be {@code null}. */
    String defaultEnc;

    /**
     * Creates a reader over the given byte stream. No bytes are read until
     * {@link #init()} or {@link #read(char[], int, int)} is called.
     *
     * @param in         input stream to be read
     * @param defaultEnc default encoding if the stream does not have a BOM
     *                   marker; give {@code null} to use the system-level default
     */
    public UnicodeReader(InputStream in, String defaultEnc) {
        internalIn = new PushbackInputStream(in, BOM_SIZE);
        this.defaultEnc = defaultEnc;
    }

    /**
     * Returns the default encoding that was passed to the constructor.
     *
     * @return the default encoding, possibly {@code null}
     */
    public String getDefaultEncoding() {
        return defaultEnc;
    }

    /**
     * Returns the stream encoding, or {@code null} if the stream is
     * uninitialized. Call {@link #init()} or {@code read()} to initialize it.
     *
     * @return the encoding name reported by the underlying
     *         {@link InputStreamReader}, or {@code null} before initialization
     */
    public String getEncoding() {
        if (internalIn2 == null) {
            return null;
        }
        return internalIn2.getEncoding();
    }

    /**
     * Reads ahead up to four bytes and checks them for a BOM. Only the BOM
     * bytes are skipped; any extra bytes are pushed back onto the stream.
     * Calling this method more than once has no further effect.
     *
     * @throws IOException if reading from the underlying stream fails or the
     *                     selected encoding is not supported
     */
    protected void init() throws IOException {
        if (internalIn2 != null) {
            return;
        }

        byte[] bom = new byte[BOM_SIZE];
        // A single read() may return fewer bytes than requested even when
        // more are available, so keep reading until we have BOM_SIZE bytes
        // or hit end of stream.
        int n = readAhead(bom);

        String encoding;
        int bomLength;
        // Longer BOMs first: FF FE is a prefix of the UTF-32LE BOM. Each test
        // also checks n so that unread (zero-filled) slots are never compared.
        if (n >= 4 && bom[0] == (byte) 0x00 && bom[1] == (byte) 0x00
                && bom[2] == (byte) 0xFE && bom[3] == (byte) 0xFF) {
            encoding = "UTF-32BE";
            bomLength = 4;
        } else if (n >= 4 && bom[0] == (byte) 0xFF && bom[1] == (byte) 0xFE
                && bom[2] == (byte) 0x00 && bom[3] == (byte) 0x00) {
            encoding = "UTF-32LE";
            bomLength = 4;
        } else if (n >= 3 && bom[0] == (byte) 0xEF && bom[1] == (byte) 0xBB
                && bom[2] == (byte) 0xBF) {
            encoding = "UTF-8";
            bomLength = 3;
        } else if (n >= 2 && bom[0] == (byte) 0xFE && bom[1] == (byte) 0xFF) {
            encoding = "UTF-16BE";
            bomLength = 2;
        } else if (n >= 2 && bom[0] == (byte) 0xFF && bom[1] == (byte) 0xFE) {
            encoding = "UTF-16LE";
            bomLength = 2;
        } else {
            // Unicode BOM mark not found, unread all bytes.
            encoding = defaultEnc;
            bomLength = 0;
        }

        // Give back everything that was read past the BOM.
        if (n > bomLength) {
            internalIn.unread(bom, bomLength, n - bomLength);
        }

        // Use the detected/given encoding, or the platform default if none.
        if (encoding == null) {
            internalIn2 = new InputStreamReader(internalIn);
        } else {
            internalIn2 = new InputStreamReader(internalIn, encoding);
        }
    }

    /**
     * Fills {@code buf} from the raw stream, stopping early only at end of
     * stream.
     *
     * @return the number of bytes actually stored in {@code buf} (0 if the
     *         stream is empty)
     */
    private int readAhead(byte[] buf) throws IOException {
        int total = 0;
        while (total < buf.length) {
            int count = internalIn.read(buf, total, buf.length - total);
            if (count < 0) {
                break;
            }
            total += count;
        }
        return total;
    }

    /**
     * Closes the reader and the underlying stream. If the reader was never
     * initialized, the raw stream is closed directly without reading from it.
     *
     * @throws IOException if closing the underlying stream fails
     */
    @Override
    public void close() throws IOException {
        if (internalIn2 != null) {
            internalIn2.close();
        } else {
            internalIn.close();
        }
    }

    /**
     * Reads decoded characters into a portion of an array, initializing the
     * encoding detection on first use.
     *
     * @param cbuf destination buffer
     * @param off  offset at which to start storing characters
     * @param len  maximum number of characters to read
     * @return the number of characters read, or -1 at end of stream
     * @throws IOException if an I/O error occurs
     */
    @Override
    public int read(char[] cbuf, int off, int len) throws IOException {
        init();
        return internalIn2.read(cbuf, off, len);
    }
}
|