背景:在解析 properties 配置文件时,经 Windows 操作系统编辑过的文件上传后,总是无法通过键获取到文件中的内容。经过分析,原因是文件的编码格式为带 BOM 的 UTF-8,因此通过下面的程序来检测文件的编码格式。
import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.util.BitSet;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * Guesses the character encoding of a text file and re-encodes text files.
 *
 * <p>Detection first looks for a byte-order mark (BOM) in the first bytes of the stream. When no
 * BOM is present, the whole content is scanned: if every byte sequence is structurally valid
 * UTF-8 the result is {@link #CODE_UTF8}, otherwise {@link #CODE_GBK} is assumed. This is a
 * heuristic, not a proof: content in other encodings is reported as one of these two labels.
 */
public class EncodeUtil {
    private static final Logger logger = LoggerFactory.getLogger(EncodeUtil.class);

    /** Number of leading bytes inspected for a BOM; the UTF-8 BOM (EF BB BF) is the longest. */
    private static final int BOM_PROBE_LENGTH = 3;

    /** Label returned for UTF-8 content (and for UTF-8 with BOM when the BOM is ignored). */
    public static final String CODE_UTF8 = "UTF-8";

    /**
     * Label returned for UTF-8 content that starts with a BOM. This is not a Java charset name;
     * {@link #convert} translates it to UTF-8 plus explicit BOM handling.
     */
    public static final String CODE_UTF8_BOM = "UTF-8_BOM";

    /** Label returned when the content is not valid UTF-8 and carries no BOM. */
    public static final String CODE_GBK = "GBK";

    /**
     * Detects the encoding of the file at the given path.
     *
     * @param fullFileName path of the file to inspect
     * @param ignoreBom    when {@code true}, UTF-8 with BOM is reported as {@link #CODE_UTF8}
     *                     instead of {@link #CODE_UTF8_BOM}
     * @return the detected encoding label
     * @throws Exception if the file cannot be opened or read
     */
    public static String getEncode(String fullFileName, boolean ignoreBom) throws Exception {
        logger.debug("fullFileName ; {}", fullFileName);
        // The stream is opened here, so it is closed here as well.
        try (BufferedInputStream bis = new BufferedInputStream(new FileInputStream(fullFileName))) {
            return getEncode(bis, ignoreBom);
        }
    }

    /**
     * Detects the encoding of the content available from the given stream.
     *
     * <p>The stream is not closed. On return it is positioned just after the inspected head
     * bytes when a BOM was found, or at end of stream when the content had to be scanned.
     *
     * @param bis       stream positioned at the start of the content
     * @param ignoreBom when {@code true}, UTF-8 with BOM is reported as {@link #CODE_UTF8}
     *                  instead of {@link #CODE_UTF8_BOM}
     * @return {@code "UTF-16"} for an FF FE BOM, {@code "Unicode"} for an FE FF BOM,
     *         {@link #CODE_UTF8} / {@link #CODE_UTF8_BOM} for UTF-8, otherwise {@link #CODE_GBK}
     * @throws Exception if reading from the stream fails
     */
    public static String getEncode(BufferedInputStream bis, boolean ignoreBom) throws Exception {
        // The mark must stay valid across the head read so the content scan can rewind.
        bis.mark(BOM_PROBE_LENGTH);
        byte[] head = new byte[BOM_PROBE_LENGTH];
        int headLength = 0;
        while (headLength < head.length) {
            int read = bis.read(head, headLength, head.length - headLength);
            if (read < 0) {
                break;
            }
            headLength += read;
        }

        String encodeType;
        if (headLength >= 2 && head[0] == (byte) 0xFF && head[1] == (byte) 0xFE) {
            encodeType = "UTF-16";
        } else if (headLength >= 2 && head[0] == (byte) 0xFE && head[1] == (byte) 0xFF) {
            encodeType = "Unicode";
        } else if (headLength >= 3
                && head[0] == (byte) 0xEF && head[1] == (byte) 0xBB && head[2] == (byte) 0xBF) {
            encodeType = ignoreBom ? CODE_UTF8 : CODE_UTF8_BOM;
        } else if (isUTF8(bis)) {
            encodeType = CODE_UTF8;
        } else {
            encodeType = CODE_GBK;
        }
        logger.info("result encode type : {}", encodeType);
        return encodeType;
    }

    /**
     * Rewinds the stream to its mark and checks that everything up to end of stream is
     * structurally valid UTF-8. Empty and pure-ASCII content counts as valid.
     *
     * <p>Only lead-byte ranges and the {@code 10xxxxxx} continuation pattern are checked;
     * overlong three/four-byte forms and encoded surrogates are not rejected.
     */
    private static boolean isUTF8(BufferedInputStream bis) throws IOException {
        bis.reset();
        int code;
        while ((code = bis.read()) != -1) {
            if ((code & 0x80) == 0) {
                continue; // single-byte (ASCII) character
            }
            int continuationBytes = continuationCount(code);
            if (continuationBytes < 0) {
                return false;
            }
            for (int i = 0; i < continuationBytes; i++) {
                int next = bis.read();
                // End of stream inside a sequence, or a byte that is not 10xxxxxx, is invalid.
                if (next == -1 || (next & 0xC0) != 0x80) {
                    return false;
                }
            }
        }
        return true;
    }

    /**
     * Returns how many continuation bytes must follow the given non-ASCII lead byte, or
     * {@code -1} when the byte cannot start a UTF-8 sequence (a stray continuation byte, the
     * overlong leads C0/C1, or anything above F4).
     */
    private static int continuationCount(int lead) {
        if (lead >= 0xC2 && lead <= 0xDF) {
            return 1;
        }
        if (lead >= 0xE0 && lead <= 0xEF) {
            return 2;
        }
        if (lead >= 0xF0 && lead <= 0xF4) {
            return 3;
        }
        return -1;
    }

    /**
     * Re-encodes a text file from one charset into another.
     *
     * <p>The source is read completely before the target is opened, so both names may refer to
     * the same file. Every line is terminated with the platform line separator in the output,
     * including the last one. Missing parent directories of the target are created.
     *
     * <p>{@link #CODE_UTF8_BOM} is accepted for either charset argument: as the source charset
     * the content is decoded as UTF-8 and a leading BOM character (U+FEFF) is dropped; as the
     * target charset a BOM is written before the UTF-8 content.
     *
     * @param oldFullFileName path of the file to read
     * @param oldCharsetName  charset name of the source file, or {@link #CODE_UTF8_BOM}
     * @param newFullFileName path of the file to write; backslashes are treated as {@code /}
     * @param newCharsetName  charset name for the target file, or {@link #CODE_UTF8_BOM}
     * @throws Exception if a charset name is unsupported or reading/writing fails
     */
    public static void convert(String oldFullFileName, String oldCharsetName, String newFullFileName, String newCharsetName) throws Exception {
        logger.info("the old file name is : {}, The oldCharsetName is : {}", oldFullFileName, oldCharsetName);
        logger.info("the new file name is : {}, The newCharsetName is : {}", newFullFileName, newCharsetName);

        boolean sourceHasBom = CODE_UTF8_BOM.equals(oldCharsetName);
        boolean targetWantsBom = CODE_UTF8_BOM.equals(newCharsetName);
        String readCharset = sourceHasBom ? CODE_UTF8 : oldCharsetName;
        String writeCharset = targetWantsBom ? CODE_UTF8 : newCharsetName;

        String lineSeparator = System.getProperty("line.separator");
        StringBuilder content = new StringBuilder();
        try (FileInputStream fis = new FileInputStream(oldFullFileName);
                BufferedReader in = new BufferedReader(new InputStreamReader(fis, readCharset))) {
            boolean firstLine = true;
            String line;
            while ((line = in.readLine()) != null) {
                if (firstLine) {
                    firstLine = false;
                    // Java's UTF-8 decoder keeps the BOM as a character; drop it when declared.
                    if (sourceHasBom && !line.isEmpty() && line.charAt(0) == '\uFEFF') {
                        line = line.substring(1);
                    }
                }
                content.append(line).append(lineSeparator);
            }
        }

        File target = new File(newFullFileName.replace("\\", "/"));
        // getParentFile() is null for a bare file name, which needs no directory creation.
        File dir = target.getParentFile();
        if (dir != null && !dir.exists() && !dir.mkdirs() && !dir.isDirectory()) {
            throw new IOException("Cannot create directory: " + dir);
        }

        // Closing the writer flushes its encoder buffer; without it the file may stay empty.
        try (FileOutputStream fos = new FileOutputStream(target);
                Writer out = new OutputStreamWriter(fos, writeCharset)) {
            if (targetWantsBom) {
                out.write('\uFEFF');
            }
            out.write(content.toString());
        }
    }
}
调用方式
/**
 * Usage example: detects the encoding of a properties file and prints it to standard output.
 *
 * @param args optional; {@code args[0]} is the path of the file to inspect. When absent, the
 *             sample path below is used.
 */
public static void main(String[] args) {
    String filePath = (args != null && args.length > 0)
            ? args[0]
            : "/home/zhanghy/文档/国寿/实时计算paas/ee/error/分/config/function_list.properties";
    // getEncode(BufferedInputStream, boolean) does not close the stream, so close it here.
    try (BufferedInputStream bis = new BufferedInputStream(new FileInputStream(filePath))) {
        String code = EncodeUtil.getEncode(bis, false);
        System.out.println("文件编码格式为:" + code);
    } catch (Exception e) {
        // Top-level boundary of the demo: report the failure instead of propagating it.
        e.printStackTrace();
    }
}