Java 判断文本文件编码格式


时间:2021-02-08 05:59:33




/**
 * Guesses the encoding of a text file from its leading byte-order mark (BOM).
 *
 * <p>Only the first two or three bytes of the file are inspected:
 * <ul>
 *   <li>{@code EF BB BF} - returns {@code "UTF-8"}</li>
 *   <li>{@code FF FE} - returns {@code "Unicode"} (this is the UTF-16 little-endian BOM;
 *       the label is kept unchanged for existing callers)</li>
 *   <li>{@code FE FF} - returns {@code "UTF-16BE"}</li>
 *   <li>anything else, including an empty or one-byte file - returns {@code "GBK"}</li>
 * </ul>
 * A file without a BOM is therefore always reported as GBK, even if it is really UTF-8.
 *
 * @param fileName the file to inspect
 * @return the name of the guessed encoding, never {@code null}
 * @throws Exception if the file cannot be opened or read
 */
public static String codeString(File fileName) throws Exception {
    // try-with-resources closes the stream even when read() throws.
    try (BufferedInputStream bin = new BufferedInputStream(new FileInputStream(fileName))) {
        int first = bin.read();
        int second = bin.read();
        if (first == -1 || second == -1) {
            // Fewer than two bytes: no BOM is possible. Without this guard the -1 EOF marker
            // would be folded into the arithmetic below (0xFF followed by EOF yields 0xFEFF).
            return "GBK";
        }
        switch ((first << 8) + second) {
            case 0xefbb:
                // The UTF-8 BOM is three bytes long; EF BB alone is also a valid GBK byte pair.
                return bin.read() == 0xbf ? "UTF-8" : "GBK";
            case 0xfffe:
                return "Unicode";
            case 0xfeff:
                return "UTF-16BE";
            default:
                return "GBK";
        }
    }
}


1、 轮询常用的编码,直到找到匹配的编码为止,如下面这段测试代码

/** Copyright Georgios Migdos <cyberpython@>.* * Licensed under the Apache License, Version 2.0 (the "License");* you may not use this file except in compliance with the License.* You may obtain a copy of the License at* * /licenses/LICENSE-2.0* * Unless required by applicable law or agreed to in writing, software* distributed under the License is distributed on an "AS IS" BASIS,* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.* See the License for the specific language governing permissions and* limitations under the License.* under the License.*/import java.io.BufferedInputStream;import java.io.File;import java.io.FileInputStream;import java.io.FileNotFoundException;import java.io.IOException;import java.io.InputStreamReader;import java.nio.ByteBuffer;import java.nio.charset.CharacterCodingException;import java.nio.charset.Charset;import java.nio.charset.CharsetDecoder;/**** @author Georgios Migdos <cyberpython@>*/public class CharsetDetector {public Charset detectCharset(File f, String[] charsets) {Charset charset = null;for (String charsetName : charsets) {charset = detectCharset(f, Charset.forName(charsetName));if (charset != null) {break;}}return charset;}private Charset detectCharset(File f, Charset charset) {try {BufferedInputStream input = new BufferedInputStream(new FileInputStream(f));CharsetDecoder decoder = charset.newDecoder();decoder.reset();byte[] buffer = new byte[512];boolean identified = false;while ((input.read(buffer) != -1) && (!identified)) {identified = identify(buffer, decoder);}input.close();if (identified) {return charset;} else {return null;}} catch (Exception e) {return null;}}private boolean identify(byte[] bytes, CharsetDecoder decoder) {try {decoder.decode(ByteBuffer.wrap(bytes));} catch (CharacterCodingException e) {return false;}return true;}public static void main(String[] args) {File f = new File("example.txt");String[] charsetsToBeTested = {"UTF-8", "windows-1253", "ISO-8859-7"};CharsetDetector cd = new 
CharsetDetector();Charset charset = cd.detectCharset(f, charsetsToBeTested);if (charset != null) {try {InputStreamReader reader = new InputStreamReader(new FileInputStream(f), charset);int c = 0;while ((c = reader.read()) != -1) {System.out.print((char)c);}reader.close();} catch (FileNotFoundException fnfe) {fnfe.printStackTrace();}catch(IOException ioe){ioe.printStackTrace();}}else{System.out.println("Unrecognized charset.");}}}

2、 使用谷歌依赖库来进行判断

