Java学习笔记-CSV文件的乱码处理

解决 CSV 文件乱码的关键，是在读取文件时指定与文件实际一致的字符编码。下面的工具类通过 BOM 和字节特征来检测文件编码，示例如下：

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.*;

/**
 * Heuristic detection of the character set used by a text file.
 *
 * <p>Detection first looks for a byte-order mark (BOM). When no BOM is present the
 * bytes are scanned for the first multi-byte pattern that distinguishes UTF-8 from
 * GBK. The result is a best-effort guess, not a guarantee: content made only of
 * ASCII or of two-byte UTF-8 sequences is reported as GBK.
 */
public class FileCharsetUtils {

	public static Logger LOG = LoggerFactory.getLogger(FileCharsetUtils.class);

	/** Charset reported when no BOM is found and the UTF-8 probe does not match. */
	private static final String DEFAULT_CHARSET = "GBK";

	/** Number of leading bytes inspected for a byte-order mark. */
	private static final int BOM_PROBE_LENGTH = 3;

	/**
	 * Detects the charset of the file at {@code filePath}.
	 *
	 * @param filePath path of the file to inspect
	 * @return the detected charset name, or {@code null} when the file cannot be opened
	 *         (the failure is logged, not thrown)
	 */
	public static String getCharset(String filePath) {
		String charset = null;
		try {
			// The stream is closed by getCharset(InputStream).
			charset = getCharset(new FileInputStream(filePath));
		} catch (FileNotFoundException e) {
			LOG.error(e.getMessage(), e);
		}
		LOG.info("文件[{}] 采用的字符集为: [{}]", filePath, charset);
		return charset;
	}

	/**
	 * Detects the charset of the given stream and closes it.
	 *
	 * <p>The stream is consumed (at least partially) and always closed before this
	 * method returns, so callers that still need the content must pass a copy.
	 *
	 * @param inputStream stream positioned at the start of the content
	 * @return {@code "UTF-16LE"}, {@code "UTF-16BE"} or {@code "UTF-8"} when detected,
	 *         otherwise {@code "GBK"} (also returned for empty input and on I/O errors,
	 *         which are logged)
	 */
	public static String getCharset(InputStream inputStream) {
		String charset = DEFAULT_CHARSET;
		// Closing the buffered wrapper also closes the wrapped stream.
		try (BufferedInputStream bis = new BufferedInputStream(inputStream)) {
			// The read limit must cover every byte read before reset(); a limit of 0
			// would leave reset() outside the InputStream.mark contract.
			bis.mark(BOM_PROBE_LENGTH);
			// Zero-initialised, so a short read can never fake a BOM match.
			byte[] head = new byte[BOM_PROBE_LENGTH];
			if (bis.read(head, 0, BOM_PROBE_LENGTH) == -1) {
				return charset; // empty input: keep the ANSI/GBK default
			}
			String fromBom = charsetFromBom(head);
			if (fromBom != null) {
				charset = fromBom;
			} else {
				// No BOM: rewind so the probe also sees the first three bytes.
				bis.reset();
				if (looksLikeUtf8(bis)) {
					charset = "UTF-8";
				}
			}
		} catch (Exception e) {
			LOG.error(e.getMessage(), e);
		}
		LOG.info("文件采用的字符集为: [{}]", charset);
		return charset;
	}

	/**
	 * Reads the first two bytes of a file and maps them to an encoding name.
	 *
	 * @param fileName path of the file to inspect
	 * @return {@code "UTF-8"}, {@code "Unicode"}, {@code "UTF-16BE"} or {@code "GBK"}
	 *         (the default when the two bytes match no known signature)
	 * @throws Exception if the file cannot be opened or read
	 */
	public static String getCodeString(String fileName) throws Exception {
		int signature;
		// try-with-resources so the file handle is released even when read() fails.
		try (BufferedInputStream bis = new BufferedInputStream(new FileInputStream(fileName))) {
			signature = (bis.read() << 8) + bis.read();
		}
		switch (signature) {
			case 0xefbb:
				return "UTF-8";
			case 0xfffe:
				return "Unicode";
			case 0xfeff:
				return "UTF-16BE";
			default:
				return "GBK";
		}
	}

	/** Maps a leading byte-order mark to its charset name; returns null when there is none. */
	private static String charsetFromBom(byte[] head) {
		if (head[0] == (byte) 0xFF && head[1] == (byte) 0xFE) {
			return "UTF-16LE";
		}
		if (head[0] == (byte) 0xFE && head[1] == (byte) 0xFF) {
			return "UTF-16BE";
		}
		if (head[0] == (byte) 0xEF && head[1] == (byte) 0xBB && head[2] == (byte) 0xBF) {
			return "UTF-8";
		}
		return null;
	}

	/** Scans for the first byte pattern that decides between UTF-8 (true) and GBK (false). */
	private static boolean looksLikeUtf8(InputStream in) throws IOException {
		int b;
		while ((b = in.read()) != -1) {
			if (b >= 0xF0) {
				return false;
			}
			if (0x80 <= b && b <= 0xBF) {
				// A continuation-range byte with no lead byte is treated as GBK.
				return false;
			}
			if (0xC0 <= b && b <= 0xDF) {
				b = in.read();
				if (0x80 <= b && b <= 0xBF) {
					// A two-byte pattern is also valid GBK, so it is inconclusive.
					continue;
				}
				return false;
			} else if (0xE0 <= b && b <= 0xEF) {
				// Three-byte lead: two continuation bytes are taken as proof of UTF-8.
				// Misdetection is possible but unlikely.
				b = in.read();
				if (b < 0x80 || b > 0xBF) {
					return false;
				}
				b = in.read();
				return 0x80 <= b && b <= 0xBF;
			}
		}
		return false;
	}

}

检测出编码后，在读取文件时指定该编码，示例如下：

import org.apache.commons.lang.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.*;
import java.util.ArrayList;
import java.util.List;
import java.util.function.Consumer;

/**
 * Reads a CSV (or any line-oriented text) source with an explicit or detected
 * charset and hands the parsed lines to a consumer in batches.
 *
 * <p>All read methods log failures instead of throwing them.
 */
public class CsvFileUtils {

	public static Logger LOG = LoggerFactory.getLogger(CsvFileUtils.class);

	/** Byte-order mark as it appears after decoding (U+FEFF). */
	private static final int BOM_CHAR = 0xFEFF;

	/**
	 * Reads the file at {@code filePath}, using the charset detected by
	 * {@link FileCharsetUtils#getCharset(String)}.
	 *
	 * @param filePath        path of the file to read
	 * @param lineHandler     converts one text line into an element
	 * @param consumer        receives each batch of converted elements
	 * @param threshold       a batch is flushed once it holds more than this many elements
	 * @param ignoreLineCount number of leading lines to skip (for example a header row)
	 */
	public static <T> void read(String filePath, LineHandler lineHandler, Consumer<List<T>> consumer,
			int threshold, int ignoreLineCount) {
		try {
			read(new FileInputStream(new File(filePath)), FileCharsetUtils.getCharset(filePath),
				lineHandler, consumer, threshold, ignoreLineCount);
		} catch (Exception e) {
			LOG.error(e.getMessage(), e);
		}
	}

	/**
	 * Reads {@code in} after detecting its charset from the content.
	 *
	 * @param in              stream to read; it is closed before this method returns
	 * @param lineHandler     converts one text line into an element
	 * @param consumer        receives each batch of converted elements
	 * @param threshold       a batch is flushed once it holds more than this many elements
	 * @param ignoreLineCount number of leading lines to skip
	 */
	public static <T> void read(InputStream in, LineHandler lineHandler, Consumer<List<T>> consumer,
			int threshold, int ignoreLineCount) {
		read(in, null, lineHandler, consumer, threshold, ignoreLineCount);
	}

	/**
	 * Reads {@code in} line by line and delivers the converted lines in batches.
	 *
	 * <p>When {@code charsetName} is blank the whole stream is buffered in memory so
	 * that the charset can be detected before decoding. A leading byte-order mark is
	 * dropped so that it does not end up in the first field of the first line.
	 *
	 * <p>The same list instance is passed to every {@code consumer} call and is
	 * cleared right after the call returns, so the consumer must not keep a reference
	 * to it.
	 *
	 * @param in              stream to read; it is closed before this method returns
	 * @param charsetName     charset used for decoding, or blank to detect it
	 * @param lineHandler     converts one text line into an element
	 * @param consumer        receives each batch of converted elements
	 * @param threshold       a batch is flushed once it holds more than this many
	 *                        elements, so a full batch holds {@code threshold + 1}
	 * @param ignoreLineCount number of leading lines to skip
	 */
	public static <T> void read(InputStream in, String charsetName, LineHandler lineHandler,
			Consumer<List<T>> consumer, int threshold, int ignoreLineCount) {
		List<T> ts = new ArrayList<T>();
		BufferedReader br = null;
		try {
			if (StringUtils.isBlank(charsetName)) {
				// Detection consumes and closes its stream, so detect on one in-memory
				// view and decode from a second view over the same bytes.
				byte[] content = readFully(in);
				String detected = FileCharsetUtils.getCharset(new ByteArrayInputStream(content));
				br = new BufferedReader(
					new InputStreamReader(new ByteArrayInputStream(content), detected));
			} else {
				br = new BufferedReader(new InputStreamReader(in, charsetName));
			}
			skipBom(br);
			for (int i = 0; i < ignoreLineCount; i++) {
				br.readLine();
			}
			String line;
			while ((line = br.readLine()) != null) {
				ts.add(lineHandler.handle(line));
				if (ts.size() > threshold) {
					consumer.accept(ts);
					ts.clear();
				}
			}
			// Deliver the final, partially filled batch.
			if (ts.size() > 0) {
				consumer.accept(ts);
				ts.clear();
			}
		} catch (Exception e) {
			LOG.error(e.getMessage(), e);
		} finally {
			// Closed independently so that one failing close() cannot skip the other.
			closeQuietly(br);
			closeQuietly(in);
		}
	}

	/**
	 * Drops a leading U+FEFF. The UTF-8, UTF-16LE and UTF-16BE decoders hand the BOM
	 * through as a regular character instead of consuming it.
	 */
	private static void skipBom(BufferedReader reader) throws IOException {
		reader.mark(1);
		if (reader.read() != BOM_CHAR) {
			reader.reset();
		}
	}

	/** Reads the stream to its end and returns the bytes; I/O failures propagate to the caller. */
	private static byte[] readFully(InputStream inputStream) throws IOException {
		ByteArrayOutputStream baos = new ByteArrayOutputStream();
		byte[] buffer = new byte[1024];
		int len;
		while ((len = inputStream.read(buffer)) > -1) {
			baos.write(buffer, 0, len);
		}
		return baos.toByteArray();
	}

	/** Closes the resource if it is non-null, logging (not throwing) any failure. */
	private static void closeQuietly(Closeable resource) {
		if (null == resource) {
			return;
		}
		try {
			resource.close();
		} catch (Exception e) {
			LOG.error(e.getMessage(), e);
		}
	}

}

版权声明：本文内容由互联网用户自发贡献，该文观点仅代表作者本人。本站仅提供信息存储空间服务，不拥有所有权，不承担相关法律责任。如发现本站有涉嫌侵权/违法违规的内容，请发送邮件至举报，一经查实，本站将立刻删除。
如需转载请保留出处：https://bianchenghao.cn/34439.html

Java学习笔记-CSV文件的乱码处理

相关推荐

发表回复