Bwgang的记录##

日积月累~! 积少成多~~

转:Java:判断文件的编码

转自:http://www.cppblog.com/biao/archive/2009/11/04/100130.aspx
首先,不同编码的文本,是根据文本开头的两到三个字节(即BOM,字节顺序标记)来定义其编码格式的。定义如下:

  • ANSI:        无格式定义;
  • Unicode:       前两个字节为FFFE;
  • Unicode big endian: 前两字节为FEFF; 
  • UTF-8:        前三字节为EFBBBF;

知道了各种编码格式的区别,写代码就容易了

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
public static String get_charset( File file ) {  
        String charset = "GBK";  
        byte[] first3Bytes = new byte[3];  
        try {  
            boolean;  
            BufferedInputStream bis = new BufferedInputStream( new FileInputStream( file ) );  
            bis.mark( 0 );  
            int read = bis.read( first3Bytes, 0, 3 );  
            if ( read == -1 ) return charset;  
            if ( first3Bytes[0] == (byte) 0xFF && first3Bytes[1] == (byte) 0xFE ) {  
                charset = "UTF-16LE";  
                checked = true;  
            }  
            else if ( first3Bytes[0] == (byte) 0xFE && first3Bytes[1] == (byte) 0xFF ) {  
                charset = "UTF-16BE";  
                checked = true;  
            }  
            else if ( first3Bytes[0] == (byte) 0xEF && first3Bytes[1] == (byte) 0xBB && first3Bytes[2] == (byte) 0xBF ) {  
                charset = "UTF-8";  
                checked = true;  
            }  
            bis.reset();  
            if ( !checked ) {  
            //    int len = 0;  
                int loc = 0;  
   
                while ( (read = bis.read()) != -1 ) {  
                    loc++;  
                    if ( read >= 0xF0 ) break;  
                    if ( 0x80 <= read && read <= 0xBF ) // 单独出现BF以下的,也算是GBK  
                    break;  
                    if ( 0xC0 <= read && read <= 0xDF ) {  
                        read = bis.read();  
                        if ( 0x80 <= read && read <= 0xBF ) // 双字节 (0xC0 - 0xDF) (0x80  
                                                                        // - 0xBF),也可能在GB编码内  
                        continue;  
                        else break;  
                    }  
                    else if ( 0xE0 <= read && read <= 0xEF ) {// 也有可能出错,但是几率较小  
                        read = bis.read();  
                        if ( 0x80 <= read && read <= 0xBF ) {  
                            read = bis.read();  
                            if ( 0x80 <= read && read <= 0xBF ) {  
                                charset = "UTF-8";  
                                break;  
                            }  
                            else break;  
                        }  
                        else break;  
                    }  
                }  
                //System.out.println( loc + " " + Integer.toHexString( read ) );  
            }  
   
            bis.close();  
        } catch ( Exception e ) {  
            e.printStackTrace();  
        }  
   
        return charset;  
    }