关键是在读取文件时指定文件的编码。获取文件编码的示例代码如下:
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.*;
/**
 * Heuristics for guessing the character encoding of a file or stream.
 *
 * <p>Detection is BOM-first: a UTF-16LE, UTF-16BE or UTF-8 byte-order mark decides the answer
 * immediately. Without a BOM the bytes are scanned for the first multi-byte sequence that looks
 * like UTF-8; anything else is reported as {@code "GBK"}. The result is a best guess, not a
 * guarantee.
 */
public class FileCharsetUtils {
    public static Logger LOG = LoggerFactory.getLogger(FileCharsetUtils.class);

    /** Charset name reported when neither a BOM nor a UTF-8 looking sequence is found. */
    private static final String DEFAULT_CHARSET = "GBK";

    /** Number of leading bytes inspected for a byte-order mark. */
    private static final int BOM_PROBE_LENGTH = 3;

    /**
     * Guesses the charset of the file at {@code filePath}.
     *
     * @param filePath path of the file to inspect
     * @return the guessed charset name, or {@code null} if the file could not be opened
     *         (the failure is logged, not thrown)
     */
    public static String getCharset(String filePath) {
        String charset = null;
        try {
            // The stream is closed by getCharset(InputStream).
            charset = getCharset(new FileInputStream(filePath));
        } catch (FileNotFoundException e) {
            LOG.error(e.getMessage(), e);
        }
        LOG.info("文件[{}] 采用的字符集为: [{}]", filePath, charset);
        return charset;
    }

    /**
     * Guesses the charset of the given stream and closes the stream afterwards.
     *
     * <p>Any exception raised while reading or closing is logged and the charset determined so
     * far (initially {@code "GBK"}) is returned.
     *
     * @param inputStream stream positioned at the start of the content; always closed on return
     * @return {@code "UTF-16LE"}, {@code "UTF-16BE"}, {@code "UTF-8"} or {@code "GBK"}
     */
    public static String getCharset(InputStream inputStream) {
        String charset = DEFAULT_CHARSET;
        // Closing the BufferedInputStream also closes the wrapped stream.
        try (BufferedInputStream bis = new BufferedInputStream(inputStream)) {
            byte[] head = new byte[BOM_PROBE_LENGTH];
            // The read limit must cover the bytes consumed before reset(); mark(0) only worked
            // by accident of BufferedInputStream's buffering.
            bis.mark(BOM_PROBE_LENGTH);
            int read = bis.read(head, 0, BOM_PROBE_LENGTH);
            if (read == -1) {
                return charset; // empty content: report the default (ANSI/GBK)
            }
            String bomCharset = charsetFromBom(head);
            if (bomCharset != null) {
                charset = bomCharset;
            } else {
                // No BOM: rewind and scan the content from the first byte.
                bis.reset();
                if (looksLikeUtf8(bis)) {
                    charset = "UTF-8";
                }
            }
        } catch (Exception e) {
            LOG.error(e.getMessage(), e);
        }
        LOG.info("文件采用的字符集为: [{}]", charset);
        return charset;
    }

    /**
     * Maps a byte-order mark at the start of {@code head} to a charset name.
     *
     * @param head the first three bytes of the content (unread positions are zero)
     * @return the charset named by the BOM, or {@code null} when no known BOM is present
     */
    private static String charsetFromBom(byte[] head) {
        if (head[0] == (byte) 0xFF && head[1] == (byte) 0xFE) {
            return "UTF-16LE"; // "Unicode" in Windows terminology
        }
        if (head[0] == (byte) 0xFE && head[1] == (byte) 0xFF) {
            return "UTF-16BE"; // "Unicode big endian"
        }
        if (head[0] == (byte) 0xEF && head[1] == (byte) 0xBB && head[2] == (byte) 0xBF) {
            return "UTF-8";
        }
        return null;
    }

    /**
     * Scans BOM-less content and decides on the first conclusive multi-byte sequence.
     *
     * <p>ASCII bytes and well-formed two-byte sequences are skipped because the latter can also
     * occur in GB encodings. The scan stops with {@code true} at the first well-formed
     * three-byte sequence and with {@code false} at a byte {@code >= 0xF0}, a stray
     * continuation byte, a malformed sequence, or end of stream.
     */
    private static boolean looksLikeUtf8(InputStream in) throws IOException {
        int b;
        while ((b = in.read()) != -1) {
            if (b >= 0xF0) {
                return false;
            }
            if (isContinuationByte(b)) {
                // A continuation byte with no lead byte is treated as GBK.
                return false;
            }
            if (0xC0 <= b && b <= 0xDF) {
                // Two-byte form (0xC0-0xDF)(0x80-0xBF) is inconclusive; keep scanning.
                if (!isContinuationByte(in.read())) {
                    return false;
                }
            } else if (0xE0 <= b && b <= 0xEF) {
                // Three-byte form: can still be wrong, but the odds are low.
                return isContinuationByte(in.read()) && isContinuationByte(in.read());
            }
        }
        return false;
    }

    /** Returns whether {@code b} lies in the UTF-8 continuation range 0x80-0xBF. */
    private static boolean isContinuationByte(int b) {
        return 0x80 <= b && b <= 0xBF;
    }

    /**
     * Guesses the charset of a file from its byte-order mark only.
     *
     * @param fileName path of the file to inspect
     * @return {@code "UTF-8"} for {@code EF BB BF}, {@code "Unicode"} for {@code FF FE},
     *         {@code "UTF-16BE"} for {@code FE FF}, otherwise {@code "GBK"}
     * @throws Exception if the file cannot be opened or read
     */
    public static String getCodeString(String fileName) throws Exception {
        // try-with-resources closes the stream even when a read fails.
        try (BufferedInputStream bis = new BufferedInputStream(new FileInputStream(fileName))) {
            int prefix = (bis.read() << 8) + bis.read();
            switch (prefix) {
                case 0xefbb:
                    // EF BB alone is also a valid GBK double-byte character, so the third
                    // BOM byte has to match before reporting UTF-8.
                    return bis.read() == 0xbf ? "UTF-8" : DEFAULT_CHARSET;
                case 0xfffe:
                    return "Unicode";
                case 0xfeff:
                    return "UTF-16BE";
                default:
                    return DEFAULT_CHARSET;
            }
        }
    }
}
读取文件时指定编码的示例代码如下:
import org.apache.commons.lang.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.*;
import java.util.ArrayList;
import java.util.List;
import java.util.function.Consumer;
/**
 * Reads line-oriented text (such as CSV) in batches, decoding it with an explicit or detected
 * charset.
 *
 * <p>All {@code read} overloads are best-effort: failures are logged and never thrown to the
 * caller.
 */
public class CsvFileUtils {
    public static Logger LOG = LoggerFactory.getLogger(CsvFileUtils.class);

    /**
     * Reads the file at {@code filePath}, using the charset reported by
     * {@code FileCharsetUtils.getCharset(filePath)}.
     *
     * @param filePath        path of the file to read
     * @param lineHandler     converts one text line into an element (project type, not shown here)
     * @param consumer        receives each batch of converted elements
     * @param threshold       a batch is handed over once it holds more than this many elements
     * @param ignoreLineCount number of leading lines to skip (for example a header row)
     */
    public static <T> void read(String filePath, LineHandler lineHandler, Consumer<List<T>> consumer,
            int threshold, int ignoreLineCount) {
        try {
            read(new FileInputStream(new File(filePath)), FileCharsetUtils.getCharset(filePath),
                    lineHandler, consumer, threshold, ignoreLineCount);
        } catch (Exception e) {
            LOG.error(e.getMessage(), e);
        }
    }

    /**
     * Reads {@code in} with a charset detected from its content. The stream is closed on return.
     *
     * @param in              stream to read
     * @param lineHandler     converts one text line into an element
     * @param consumer        receives each batch of converted elements
     * @param threshold       a batch is handed over once it holds more than this many elements
     * @param ignoreLineCount number of leading lines to skip
     */
    public static <T> void read(InputStream in, LineHandler lineHandler, Consumer<List<T>> consumer,
            int threshold, int ignoreLineCount) {
        read(in, null, lineHandler, consumer, threshold, ignoreLineCount);
    }

    /**
     * Reads {@code in} line by line and hands the converted lines to {@code consumer} in batches.
     *
     * <p>The same list instance is passed to every {@code consumer} call and cleared right
     * afterwards, so the consumer must not keep a reference to it. A batch is flushed when its
     * size exceeds {@code threshold}, and any remainder is flushed at end of input. The stream is
     * closed on return, and any exception is logged rather than thrown.
     *
     * @param in              stream to read
     * @param charsetName     charset used for decoding; when blank, the content is buffered in
     *                        memory and the charset is detected with {@code FileCharsetUtils}
     * @param lineHandler     converts one text line into an element
     * @param consumer        receives each batch of converted elements
     * @param threshold       a batch is handed over once it holds more than this many elements
     * @param ignoreLineCount number of leading lines to skip
     */
    public static <T> void read(InputStream in, String charsetName, LineHandler lineHandler,
            Consumer<List<T>> consumer, int threshold, int ignoreLineCount) {
        List<T> ts = new ArrayList<T>();
        BufferedReader br = null;
        try {
            br = openReader(in, charsetName);
            for (int i = 0; i < ignoreLineCount; i++) {
                br.readLine();
            }
            String line;
            while ((line = br.readLine()) != null) {
                ts.add(lineHandler.handle(line));
                if (ts.size() > threshold) {
                    consumer.accept(ts);
                    ts.clear();
                }
            }
            if (!ts.isEmpty()) {
                consumer.accept(ts);
                ts.clear();
            }
        } catch (Exception e) {
            LOG.error(e.getMessage(), e);
        } finally {
            // Close each resource on its own so one failing close() cannot skip the other.
            closeQuietly(br);
            closeQuietly(in);
        }
    }

    /**
     * Builds the reader for {@code in}: decodes directly when a charset is given, otherwise
     * buffers the whole content so it can be inspected for its charset and then read again.
     */
    private static BufferedReader openReader(InputStream in, String charsetName) throws IOException {
        if (!StringUtils.isBlank(charsetName)) {
            return new BufferedReader(new InputStreamReader(in, charsetName));
        }
        byte[] content = readFully(in);
        // Both views share one array; getCharset consumes and closes only its own view.
        String detected = FileCharsetUtils.getCharset(new ByteArrayInputStream(content));
        return new BufferedReader(new InputStreamReader(new ByteArrayInputStream(content), detected));
    }

    /** Reads {@code inputStream} to its end and returns the bytes; I/O errors propagate. */
    private static byte[] readFully(InputStream inputStream) throws IOException {
        ByteArrayOutputStream baos = new ByteArrayOutputStream();
        byte[] buffer = new byte[1024];
        int len;
        while ((len = inputStream.read(buffer)) > -1) {
            baos.write(buffer, 0, len);
        }
        return baos.toByteArray();
    }

    /** Closes {@code resource} if it is non-null, logging any failure instead of throwing it. */
    private static void closeQuietly(Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (Exception e) {
            LOG.error(e.getMessage(), e);
        }
    }
}
版权声明:本文内容由互联网用户自发贡献,该文观点仅代表作者本人。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌侵权/违法违规的内容, 请发送邮件至 举报,一经查实,本站将立刻删除。
如需转载请保留出处:https://bianchenghao.cn/34439.html