Friday, July 31, 2009

UTF-8 encoding problems?

Use this code to find the location of a wayward non-UTF-8 byte sequence:

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CoderResult;
import java.nio.charset.CodingErrorAction;

/**
 * Command-line check that a file is well-formed UTF-8.
 *
 * <p>The file is decoded as one continuous stream, so multi-byte characters
 * (including ones that straddle a read-buffer boundary) are handled
 * correctly. The decoded text is echoed to standard output; if decoding
 * fails, the byte offset of the first invalid sequence is reported on
 * standard error.
 */
public class CheckEncoding {

    /** Size, in bytes and in chars, of the working buffers. */
    private static final int BUFFER_SIZE = 8192;

    /**
     * Checks the file named by {@code args[0]}.
     *
     * @param args command-line arguments; {@code args[0]} is the file to check
     */
    public static void main(String[] args) {
        if (args.length < 1) {
            System.err.println("Usage: java CheckEncoding <file>");
            return;
        }
        FileInputStream fis = null;
        try {
            fis = new FileInputStream(new File(args[0]));
            long offset = findFirstInvalidOffset(fis, System.out);
            System.out.flush();
            if (offset >= 0) {
                System.err.println();
                System.err.println("Invalid UTF-8 sequence at byte offset " + offset);
            }
        } catch (IOException e) {
            System.err.println("Could not read " + args[0] + ": " + e);
        } finally {
            if (fis != null) {
                try {
                    fis.close();
                } catch (IOException ignored) {
                    // Nothing useful can be done if closing a read-only stream fails.
                }
            }
        }
    }

    /**
     * Decodes {@code in} as UTF-8 and returns the position of the first byte
     * sequence that is not valid UTF-8.
     *
     * @param in   the bytes to check; read until end of stream or until the
     *             first invalid sequence, and not closed by this method
     * @param echo receives the text that was decoded successfully before the
     *             error (or all of it if there is no error); may be
     *             {@code null} to discard the text
     * @return the zero-based byte offset at which the first invalid sequence
     *         starts, or {@code -1} if the whole stream is valid UTF-8
     * @throws IOException if reading {@code in} or writing {@code echo} fails
     */
    static long findFirstInvalidOffset(InputStream in, Appendable echo) throws IOException {
        CharsetDecoder decoder = Charset.forName("UTF-8").newDecoder();
        decoder.onMalformedInput(CodingErrorAction.REPORT);
        decoder.onUnmappableCharacter(CodingErrorAction.REPORT);

        ByteBuffer bytes = ByteBuffer.allocate(BUFFER_SIZE);
        CharBuffer chars = CharBuffer.allocate(BUFFER_SIZE);
        long base = 0; // stream offset of the byte stored at index 0 of `bytes`
        boolean eof = false;

        while (!eof) {
            // Append after any bytes carried over from an incomplete sequence.
            int count = in.read(bytes.array(), bytes.position(), bytes.remaining());
            if (count < 0) {
                eof = true;
            } else {
                bytes.position(bytes.position() + count);
            }
            bytes.flip();

            CoderResult result;
            do {
                // endOfInput=false lets a trailing partial sequence wait for
                // more bytes; only at real EOF is it treated as malformed.
                result = decoder.decode(bytes, chars, eof);
                chars.flip();
                if (echo != null) {
                    echo.append(chars);
                }
                chars.clear();
                if (result.isError()) {
                    // On error the input position is the start of the bad sequence.
                    return base + bytes.position();
                }
            } while (result.isOverflow());

            base += bytes.position();
            bytes.compact(); // keep the undecoded tail for the next round
        }
        return -1;
    }
}


Use this class to automatically scrub out bad characters (each invalid byte sequence is replaced with a space):

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.ByteBuffer;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.CodingErrorAction;

/**
 * An {@link InputStream} filter that yields clean UTF-8: well-formed
 * characters from the wrapped stream pass through unchanged, and each invalid
 * byte sequence is replaced with a single space.
 *
 * <p>Input is decoded in chunks as one continuous stream, so a multi-byte
 * character split across two reads of the wrapped stream is still recognised
 * as valid. Instances are not safe for use by multiple threads.
 */
public class UTF8Scrubber extends InputStream {

    /** Number of bytes requested from the wrapped stream per refill. */
    private static final int CHUNK_SIZE = 8192;

    CharsetDecoder decoder = null;
    CharsetEncoder encoder = null;
    private final InputStream is;

    /** Raw input not yet decoded; between refills it holds only an incomplete tail. */
    private final ByteBuffer pending;
    // Fully qualified: java.nio.CharBuffer is not in this file's import list.
    private final java.nio.CharBuffer decoded;
    /** Scrubbed UTF-8 bytes ready to hand to the caller (kept in read mode). */
    private final ByteBuffer scrubbed;
    /** Set once the wrapped stream has reported end of stream. */
    private boolean finished;

    /**
     * Wraps {@code is} so that reads return scrubbed UTF-8.
     *
     * @param is the stream to scrub; closed when this stream is closed
     * @throws NullPointerException if {@code is} is {@code null}
     */
    public UTF8Scrubber(InputStream is) {
        super();
        if (is == null) {
            throw new NullPointerException("is");
        }
        this.is = is;
        decoder = Charset.forName("UTF-8").newDecoder();
        decoder.onMalformedInput(CodingErrorAction.REPLACE);
        decoder.replaceWith(" ");
        decoder.onUnmappableCharacter(CodingErrorAction.REPLACE);
        encoder = Charset.forName("UTF-8").newEncoder();

        pending = ByteBuffer.allocate(CHUNK_SIZE);
        // One-char replacement per bad sequence: never more chars than input bytes.
        decoded = java.nio.CharBuffer.allocate(CHUNK_SIZE);
        scrubbed = ByteBuffer.allocate(
                (int) Math.ceil(CHUNK_SIZE * (double) encoder.maxBytesPerChar()));
        scrubbed.flip(); // start out empty in read mode
    }

    /**
     * Returns the next scrubbed byte.
     *
     * @return the byte as a value in {@code 0..255}, or {@code -1} at end of stream
     * @throws IOException if the wrapped stream fails
     */
    @Override
    public int read() throws IOException {
        if (!fill()) {
            return -1;
        }
        return scrubbed.get() & 0xFF; // mask: InputStream.read() must not return negative bytes
    }

    /**
     * Reads up to {@code len} scrubbed bytes into {@code b}.
     *
     * @return the number of bytes stored, {@code 0} if {@code len} is zero,
     *         or {@code -1} at end of stream
     * @throws IOException if the wrapped stream fails
     * @throws IndexOutOfBoundsException if {@code off}/{@code len} do not fit {@code b}
     */
    @Override
    public int read(byte[] b, int off, int len) throws IOException {
        if (b == null) {
            throw new NullPointerException("b");
        }
        if (off < 0 || len < 0 || len > b.length - off) {
            throw new IndexOutOfBoundsException();
        }
        if (len == 0) {
            return 0;
        }
        if (!fill()) {
            return -1;
        }
        int count = Math.min(len, scrubbed.remaining());
        scrubbed.get(b, off, count);
        return count;
    }

    /** Closes the wrapped stream. */
    @Override
    public void close() throws IOException {
        is.close();
    }

    /**
     * Ensures {@code scrubbed} has at least one byte available, pulling and
     * converting further chunks from the wrapped stream as needed.
     *
     * @return {@code false} if the stream is exhausted, {@code true} otherwise
     */
    private boolean fill() throws IOException {
        while (!scrubbed.hasRemaining()) {
            if (finished) {
                return false;
            }
            // Append after any incomplete sequence carried over from the last chunk.
            int count = is.read(pending.array(), pending.position(), pending.remaining());
            boolean end = count < 0;
            if (!end) {
                pending.position(pending.position() + count);
            }
            pending.flip();

            decoded.clear();
            // With end == false a trailing partial character is left in
            // `pending`; only at real EOF is it replaced as malformed.
            check(decoder.decode(pending, decoded, end));
            if (end) {
                check(decoder.flush(decoded));
            }
            pending.compact();
            decoded.flip();

            scrubbed.clear();
            check(encoder.encode(decoded, scrubbed, end));
            if (end) {
                check(encoder.flush(scrubbed));
            }
            scrubbed.flip();
            finished = end;
        }
        return true;
    }

    /** Turns any coder result other than a normal underflow into an exception. */
    private static void check(java.nio.charset.CoderResult result) throws IOException {
        if (!result.isUnderflow()) {
            result.throwException();
        }
    }

    /**
     * Scrubs the file named by {@code args[0]} and writes the resulting UTF-8
     * bytes to standard output.
     *
     * @param args command-line arguments; {@code args[0]} is the file to scrub
     * @throws IOException if the file cannot be read
     */
    public static void main(String[] args) throws IOException {
        if (args.length < 1) {
            System.err.println("Usage: java UTF8Scrubber <file>");
            return;
        }
        byte[] buff = new byte[1024];
        InputStream is = new UTF8Scrubber(new FileInputStream(new File(args[0])));
        try {
            int count;
            while ((count = is.read(buff)) > 0) {
                // Copy bytes as-is: building a String per chunk could split a
                // multi-byte character and would re-encode with the platform charset.
                System.out.write(buff, 0, count);
            }
            System.out.flush();
        } finally {
            is.close();
        }
    }

}

No comments:

Post a Comment