You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
ntool/nstr/codec.go

85 lines
1.9 KiB
Go

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

package nstr
import (
"golang.org/x/text/encoding/simplifiedchinese"
"unicode/utf8"
)
// Encoding names returned by Charset.
const (
// GBK is returned by Charset for input that is not valid UTF-8 but passes IsGBK.
GBK string = "GBK"
// UTF8 is returned by Charset for input that is valid UTF-8.
UTF8 string = "UTF8"
// UNKNOWN is returned by Charset for input that is neither valid UTF-8 nor accepted by IsGBK.
UNKNOWN string = "UNKNOWN"
)
// Charset reports the detected encoding of data as one of the constants
// UTF8, GBK or UNKNOWN.
//
// The UTF-8 check deliberately runs first. IsGBK only tests whether the bytes
// fall inside GBK's byte ranges, and UTF-8 encoded text can satisfy that test
// as well, so the result of IsGBK is only meaningful once utf8.Valid has
// already rejected the input.
func Charset(data []byte) string {
	switch {
	case utf8.Valid(data):
		return UTF8
	case IsGBK(data):
		return GBK
	default:
		return UNKNOWN
	}
}
// IsGBK reports whether data consists solely of single-byte ASCII characters
// (0x00-0x7F) and well-formed GBK double-byte sequences: a lead byte in
// 0x81-0xFE followed by a trail byte in 0x40-0xFE other than 0x7F.
//
// The check is purely range based, and UTF-8 encoded text can fall inside the
// same byte ranges. Calling IsGBK is therefore only meaningful after
// utf8.Valid has already reported that the data is not UTF-8.
//
// Empty input is reported as GBK (true). Input that ends with a lead byte
// lacking its trail byte is reported as not GBK (false).
//
// Usage:
//
//	data := []byte("你好")
//	if utf8.Valid(data) {
//		fmt.Println("data encoding is utf-8")
//	} else if IsGBK(data) {
//		fmt.Println("data encoding is GBK")
//	}
func IsGBK(data []byte) bool {
	for i := 0; i < len(data); {
		lead := data[i]
		if lead <= 0x7f {
			// 0x00-0x7F is a single byte, identical to ASCII.
			i++
			continue
		}
		// Everything above 0x7F must start a double-byte sequence.
		if lead < 0x81 || lead > 0xfe {
			return false
		}
		// A lead byte at the very end of the input has no trail byte;
		// treat the truncated sequence as invalid instead of reading
		// past the end of the slice.
		if i+1 >= len(data) {
			return false
		}
		// 0x7F is the only value inside 0x40-0xFE that is not a valid
		// GBK trail byte.
		trail := data[i+1]
		if trail < 0x40 || trail > 0xfe || trail == 0x7f {
			return false
		}
		i += 2
	}
	return true
}
// ToGBK runs data through the GB18030 decoder from
// golang.org/x/text/encoding/simplifiedchinese and returns the decoded bytes.
//
// NOTE(review): despite the name, this uses a decoder, so the conversion goes
// from GB18030-encoded input to UTF-8 output rather than to GBK.
//
// If decoding fails, the original data is returned unchanged together with
// the decoder's error.
func ToGBK(data []byte) ([]byte, error) {
	decoder := simplifiedchinese.GB18030.NewDecoder()
	decoded, err := decoder.Bytes(data)
	if err == nil {
		return decoded, nil
	}
	return data, err
}
// ToGBKStrSafe converts data to a string, decoding it with ToGBK only when
// the bytes look like GBK.
//
//   - Valid UTF-8 input is returned as-is.
//   - Input that is not valid UTF-8 but passes IsGBK is decoded with ToGBK;
//     if decoding fails, the empty string is returned.
//   - Anything else is returned as-is, without conversion.
func ToGBKStrSafe(data []byte) string {
	// UTF-8 is checked first so IsGBK is only consulted for non-UTF-8 input.
	if utf8.Valid(data) || !IsGBK(data) {
		return string(data)
	}
	decoded, err := ToGBK(data)
	if err != nil {
		return ""
	}
	return string(decoded)
}