栏目分类:
子分类:
返回
名师互学网用户登录
快速导航关闭
当前搜索
当前分类
子分类
实用工具
热门搜索
名师互学网 > IT > 面试经验 > 面试问答

如何在 Golang 中将 UTF-16 文本文件读取为字符串?

面试问答 更新时间: 发布时间: IT归档 最新发布 模块sitemap 名妆网 法律咨询 聚返吧 英语巴士网 伯小乐 网商动力

如何在 Golang 中将 UTF-16 文本文件读取为字符串?

UTF-16、UTF-8 和字节顺序标记(BOM)由 Unicode 联盟(Unicode Consortium)定义,参见:UTF-16 常见问题解答、UTF-8 常见问题解答和字节顺序标记(BOM)常见问题解答。


Issue 4802:bufio:按行读取过于繁琐

从文件中读取行在Go中太麻烦了。

人们常常因为名称而被 bufio.Reader.ReadLine 吸引,但它的签名很奇怪:返回 (line []byte, isPrefix bool, err error),使用起来需要做大量额外工作。

ReadSlice 和 ReadString 需要一个定界符字节,它几乎总是显而易见却难看的 '\n';并且它们还可能同时返回一行数据和一个 EOF。


修订:f685026a2d38

bufio:新的 Scanner 接口

添加一个基于名为“Scanner”的新类型的简单接口,用于扫描(通常是文本)数据。它有自己的内部缓冲,因此即使没有传入 bufio.Reader 也应当高效。输入的格式由“拆分函数”(split function)定义,默认按行拆分。


go1.1beta1发布

您可以从通常的位置下载二进制和源码发行版:https://code.google.com/p/go/downloads/list?q=go1.1beta1


这是一个使用 Unicode 规则将 UTF-16 文本文件的各行转换为 Go 的 UTF-8 编码字符串的程序。该代码已经过修改,以利用 Go 1.1 中新的 bufio.Scanner 接口。

// Command main reads a UTF-16 encoded text file line by line with a
// bufio.Scanner and prints every line converted to a UTF-8 Go string.
package main

import (
	"bufio"
	"bytes"
	"encoding/binary"
	"fmt"
	"os"
	"runtime"
	"unicode/utf16"
	"unicode/utf8"
)

// UTF16BytesToString converts UTF-16 encoded bytes, in big or little endian
// byte order, to a UTF-8 encoded string.
//
// b is read two bytes at a time using o. If b has an odd length, the dangling
// final byte is decoded as utf8.RuneError. o must not be nil.
func UTF16BytesToString(b []byte, o binary.ByteOrder) string {
	units := make([]uint16, (len(b)+1)/2)
	for i := 0; i+1 < len(b); i += 2 {
		units[i/2] = o.Uint16(b[i:])
	}
	if len(b)%2 != 0 {
		// The last slot has no complete code unit behind it.
		units[len(units)-1] = utf8.RuneError
	}
	return string(utf16.Decode(units))
}

// UTF-16 endian byte order.
const (
	unknownEndian = iota
	bigEndian
	littleEndian
)

// dropCREndian drops a terminal two-byte sequence t1, t2 (an encoded \r)
// from data. data is returned unchanged when it does not end with t1, t2.
func dropCREndian(data []byte, t1, t2 byte) []byte {
	if n := len(data); n > 1 && data[n-2] == t1 && data[n-1] == t2 {
		return data[:n-2]
	}
	return data
}

// dropCRBE drops a terminal \r from the big endian data.
func dropCRBE(data []byte) []byte {
	return dropCREndian(data, '\x00', '\r')
}

// dropCRLE drops a terminal \r from the little endian data.
func dropCRLE(data []byte) []byte {
	return dropCREndian(data, '\r', '\x00')
}

// dropCR drops a terminal \r from data whose byte order is not yet known.
//
// It returns the trimmed data together with the byte order implied by the
// \r that was found (littleEndian or bigEndian). When data does not end in an
// encoded \r, data is returned unchanged with unknownEndian.
func dropCR(data []byte) ([]byte, int) {
	if trimmed := dropCRLE(data); len(trimmed) != len(data) {
		return trimmed, littleEndian
	}
	if trimmed := dropCRBE(data); len(trimmed) != len(data) {
		return trimmed, bigEndian
	}
	return data, unknownEndian
}

// ScanUTF16LinesFunc returns a split function for a bufio.Scanner together
// with a function reporting the byte order in use.
//
// The split function returns each line of UTF-16 text, stripped of any
// trailing end-of-line marker. The returned line may be empty. The
// end-of-line marker is one optional carriage return followed by one
// mandatory newline. In regular expression notation, it is `\r?\n`. The last
// non-empty line of input will be returned even if it has no newline.
//
// byteOrder may be binary.BigEndian, binary.LittleEndian, or nil. With nil
// the byte order is inferred: first from a leading byte order mark (which is
// consumed and not returned as part of a line), then from the encoding of the
// first newline or trailing carriage return, and finally, if the input gives
// no hint at all, from runtime.GOOS (little endian on windows, big endian
// elsewhere).
//
// The second return value yields the current byte order, or nil while it is
// still unknown. It is non-nil by the time the scanner delivers a token.
func ScanUTF16LinesFunc(byteOrder binary.ByteOrder) (bufio.SplitFunc, func() binary.ByteOrder) {
	// State shared by both closures.
	endian := unknownEndian
	switch byteOrder {
	case binary.BigEndian:
		endian = bigEndian
	case binary.LittleEndian:
		endian = littleEndian
	}
	const bom = 0xFEFF
	checkBOM := endian == unknownEndian

	// Scanner split function.
	splitFunc := func(data []byte, atEOF bool) (advance int, token []byte, err error) {
		if atEOF && len(data) == 0 {
			return 0, nil, nil
		}

		if checkBOM {
			if len(data) < 2 && !atEOF {
				// Not enough bytes to recognise a BOM yet; keep the check
				// armed and ask for more data instead of giving up on it.
				return 0, nil, nil
			}
			checkBOM = false
			if len(data) > 1 {
				hi, lo := uint16(data[0]), uint16(data[1])
				switch {
				case hi<<8|lo == bom:
					endian = bigEndian
					return 2, nil, nil
				case lo<<8|hi == bom:
					endian = littleEndian
					return 2, nil, nil
				}
			}
		}

		// Scan for newline-terminated lines. A '\n' byte at an odd offset is
		// the low byte of a big endian code unit; at an even offset it is the
		// low byte of a little endian code unit. The other byte of the unit
		// must be zero for it to really be U+000A.
		for i := 0; ; i++ {
			j := bytes.IndexByte(data[i:], '\n')
			if j < 0 {
				break
			}
			i += j
			if i%2 == 1 { // UTF-16BE candidate
				// i is odd, so data[i-1] always exists; this also covers an
				// empty line at the very start of data (i == 1).
				if endian != littleEndian && data[i-1] == '\x00' {
					endian = bigEndian
					// We have a full newline-terminated line.
					return i + 1, dropCRBE(data[:i-1]), nil
				}
				continue
			}
			// UTF-16LE candidate; the high byte may not have arrived yet.
			if endian != bigEndian && i+1 < len(data) {
				i++
				if data[i] == '\x00' {
					endian = littleEndian
					// We have a full newline-terminated line.
					return i + 1, dropCRLE(data[:i-1]), nil
				}
			}
		}

		// If we're at EOF, we have a final, non-terminated line. Return it.
		if atEOF {
			consumed := len(data)
			switch endian {
			case bigEndian:
				data = dropCRBE(data)
			case littleEndian:
				data = dropCRLE(data)
			default:
				data, endian = dropCR(data)
			}
			if endian == unknownEndian {
				// Nothing in the input revealed the byte order; fall back to
				// the platform convention.
				if runtime.GOOS == "windows" {
					endian = littleEndian
				} else {
					endian = bigEndian
				}
			}
			return consumed, data, nil
		}

		// Request more data.
		return 0, nil, nil
	}

	// Endian byte order function.
	orderFunc := func() binary.ByteOrder {
		switch endian {
		case bigEndian:
			return binary.BigEndian
		case littleEndian:
			return binary.LittleEndian
		}
		return nil
	}

	return splitFunc, orderFunc
}

// main opens utf16.le.txt, scans it line by line as UTF-16 with the byte
// order inferred from the data, and prints each line as a UTF-8 string along
// with its raw bytes, followed by the detected byte order.
func main() {
	file, err := os.Open("utf16.le.txt")
	if err != nil {
		fmt.Println(err)
		os.Exit(1)
	}
	defer file.Close()
	fmt.Println(file.Name())

	// bufio.Scanner buffers internally, so the file is passed directly.
	scanner := bufio.NewScanner(file)
	var bo binary.ByteOrder // unknown, infer from data
	// bo = binary.LittleEndian // windows
	splitFunc, orderFunc := ScanUTF16LinesFunc(bo)
	scanner.Split(splitFunc)

	for scanner.Scan() {
		b := scanner.Bytes()
		s := UTF16BytesToString(b, orderFunc())
		fmt.Println(len(s), s)
		fmt.Println(len(b), b)
	}
	fmt.Println(orderFunc())
	if err := scanner.Err(); err != nil {
		fmt.Println(err)
	}
}

输出:

utf16.le.txt
15 "Hello, 世界"
22 [34 0 72 0 101 0 108 0 108 0 111 0 44 0 32 0 22 78 76 117 34 0]
0 
0 []
15 "Hello, 世界"
22 [34 0 72 0 101 0 108 0 108 0 111 0 44 0 32 0 22 78 76 117 34 0]
LittleEndian

utf16.be.txt
15 "Hello, 世界"
22 [0 34 0 72 0 101 0 108 0 108 0 111 0 44 0 32 78 22 117 76 0 34]
0 
0 []
15 "Hello, 世界"
22 [0 34 0 72 0 101 0 108 0 108 0 111 0 44 0 32 78 22 117 76 0 34]
BigEndian


转载请注明:文章转载自 www.mshxw.com
本文地址:https://www.mshxw.com/it/495532.html
我们一直用心在做
关于我们 文章归档 网站地图 联系我们

版权所有 (c)2021-2022 MSHXW.COM

ICP备案号:晋ICP备2021003244-6号