Skip to content
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
164 changes: 164 additions & 0 deletions bpe.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,164 @@
package bpe

import (
	"bufio"
	"encoding/binary"
	"fmt"
	"io"
	"strings"

	"github.com/sirupsen/logrus"
)

// TokenID is a numerical identifier of a subword token.
type TokenID uint32

// EncodedToken is a sequence of subword token IDs.
type EncodedToken []TokenID

// rule is a single merge rule of the model. When the model is loaded, the
// char sequence of result is built as the char sequence of left followed by
// the char sequence of right.
type rule struct {
// left is the ID of the first token of the merged pair.
left TokenID
// right is the ID of the second token of the merged pair.
right TokenID
// result is the ID of the token produced by the merge.
result TokenID
}

// specialTokens holds the IDs of the model's special tokens. The IDs are
// signed: newModel initializes every field to -1 (presumably meaning
// "not set" - TODO confirm).
type specialTokens struct {
unk int32 // presumably the "unknown token" ID - confirm
pad int32 // presumably the "padding" ID - confirm
bos int32 // presumably the "beginning of sentence" ID - confirm
eos int32 // presumably the "end of sentence" ID - confirm
}

// Model is a Byte-Pair encoding model, which supports encoding and decoding text into sequences
// of most frequent subword tokens.
type Model struct {
// char2id maps a char to its token ID.
char2id map[rune]TokenID
// id2char is the inverse of char2id: it maps a char's token ID back to the char.
id2char map[TokenID]rune
// rules holds the merge rules in the order they appear in the dump.
rules []rule
// recipe maps a token ID to the sequence of char IDs the token consists of;
// a single char maps to a one-element sequence holding its own ID.
recipe map[TokenID]EncodedToken
// revRecipe maps the decoded string of a token to the token ID.
revRecipe map[string]TokenID
// specialTokens holds the IDs of the special tokens read from the dump.
specialTokens specialTokens
}

// newModel allocates an empty Model: every lookup map is created empty, the
// rule table gets nRules zero-valued entries and all special token IDs are
// set to -1.
func newModel(nRules int) *Model {
	model := new(Model)
	model.char2id = map[rune]TokenID{}
	model.id2char = map[TokenID]rune{}
	model.rules = make([]rule, nRules)
	model.recipe = map[TokenID]EncodedToken{}
	model.revRecipe = map[string]TokenID{}
	model.specialTokens = specialTokens{unk: -1, pad: -1, bos: -1, eos: -1}
	return model
}

// DecodeToken converts a sequence of char IDs into the string made of the
// corresponding chars, looked up in id2char.
//
// A nil or empty token decodes to the empty string. If any ID of token has no
// entry in id2char, DecodeToken returns an empty string and a non-nil error
// instead of terminating the process, so callers can handle malformed input.
func DecodeToken(token EncodedToken, id2char map[TokenID]rune) (string, error) {
	var word strings.Builder
	// Every char takes at least one byte, so this is a cheap lower bound.
	word.Grow(len(token))
	for _, id := range token {
		char, ok := id2char[id]
		if !ok {
			return "", fmt.Errorf("%d key not found in id2char", id)
		}
		// string(char) keeps the exact conversion semantics of the previous
		// implementation, including for invalid runes.
		word.WriteString(string(char))
	}
	return word.String(), nil
}

func specialTokensToBin(specials specialTokens) []byte {
Comment thread
irinakhismatullina marked this conversation as resolved.
Outdated
bytesArray := make([]byte, 16)
binary.BigEndian.PutUint32(bytesArray, uint32(specials.unk))
binary.BigEndian.PutUint32(bytesArray[4:], uint32(specials.pad))
binary.BigEndian.PutUint32(bytesArray[8:], uint32(specials.bos))
binary.BigEndian.PutUint32(bytesArray[12:], uint32(specials.eos))
return bytesArray
}

func binToSpecialTokens(bytesArray []byte) specialTokens {
Comment thread
irinakhismatullina marked this conversation as resolved.
Outdated
var s specialTokens
s.unk = int32(binary.BigEndian.Uint32(bytesArray))
s.pad = int32(binary.BigEndian.Uint32(bytesArray[4:]))
s.bos = int32(binary.BigEndian.Uint32(bytesArray[8:]))
s.eos = int32(binary.BigEndian.Uint32(bytesArray[12:]))
return s
}

func ruleToBin(rule rule) []byte {
Comment thread
irinakhismatullina marked this conversation as resolved.
Outdated
bytesArray := make([]byte, 12)
binary.BigEndian.PutUint32(bytesArray, uint32(rule.left))
binary.BigEndian.PutUint32(bytesArray[4:], uint32(rule.right))
binary.BigEndian.PutUint32(bytesArray[8:], uint32(rule.result))
return bytesArray
}

func binToRule(bytesArray []byte) rule {
Comment thread
irinakhismatullina marked this conversation as resolved.
Outdated
var r rule
r.left = TokenID(binary.BigEndian.Uint32(bytesArray))
r.right = TokenID(binary.BigEndian.Uint32(bytesArray[4:]))
r.result = TokenID(binary.BigEndian.Uint32(bytesArray[8:]))
return r
}

// ReadModelFromBinary loads the BPE model from the binary dump
func ReadModelFromBinary(reader io.Reader) (*Model, error) {
Comment thread
irinakhismatullina marked this conversation as resolved.
Outdated
bytesReader := bufio.NewReader(reader)
buf := make([]byte, 4)
var nChars, nRules int
_, err := bytesReader.Read(buf)
if err != nil {
Comment thread
irinakhismatullina marked this conversation as resolved.
Outdated
logrus.Fatal("Broken input: ", err)
Comment thread
irinakhismatullina marked this conversation as resolved.
Outdated
return &Model{}, err
}
nChars = int(binary.BigEndian.Uint32(buf))
_, err = bytesReader.Read(buf)
if err != nil {
logrus.Fatal("Broken input: ", err)
Comment thread
irinakhismatullina marked this conversation as resolved.
Outdated
return &Model{}, err
}
nRules = int(binary.BigEndian.Uint32(buf))

model := newModel(nRules)
for i := 0; i < nChars; i++ {
var char rune
var charID TokenID
_, err = bytesReader.Read(buf)
if err != nil {
logrus.Fatal("Broken input: ", err)
Comment thread
irinakhismatullina marked this conversation as resolved.
Outdated
return &Model{}, err
}
char = rune(binary.BigEndian.Uint32(buf))
_, err = bytesReader.Read(buf)
if err != nil {
logrus.Fatal("Broken input: ", err)
Comment thread
irinakhismatullina marked this conversation as resolved.
Outdated
return &Model{}, err
}
charID = TokenID(binary.BigEndian.Uint32(buf))
model.char2id[char] = charID
model.id2char[charID] = char
model.recipe[charID] = EncodedToken{charID}
model.revRecipe[string(char)] = charID
}
ruleBuf := make([]byte, 12)
for i := 0; i < nRules; i++ {
_, err = bytesReader.Read(ruleBuf)
if err != nil {
logrus.Fatal("Broken input: ", err)
Comment thread
irinakhismatullina marked this conversation as resolved.
Outdated
return &Model{}, err
}
rule := binToRule(ruleBuf)
model.rules[i] = rule
model.recipe[rule.result] = append(model.recipe[rule.left], model.recipe[rule.right]...)
resultString, err := DecodeToken(model.recipe[rule.result], model.id2char)
if err != nil {
logrus.Fatal("Unexpected token id inside the rules: ", err)
Comment thread
irinakhismatullina marked this conversation as resolved.
Outdated
return model, err
}
model.revRecipe[resultString] = rule.result
}
specialTokensBuf := make([]byte, 16)
_, err = bytesReader.Read(specialTokensBuf)
if err != nil {
logrus.Fatal("Broken input: ", err)
Comment thread
irinakhismatullina marked this conversation as resolved.
Outdated
return &Model{}, err
}
model.specialTokens = binToSpecialTokens(specialTokensBuf)
return model, nil
}
70 changes: 70 additions & 0 deletions bpe_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
package bpe

import (
"bytes"
"testing"

"github.com/stretchr/testify/require"
)

// TestNewModel checks that newModel creates as many rule slots as requested.
func TestNewModel(t *testing.T) {
	const nRules = 10
	created := newModel(nRules)
	require.Equal(t, nRules, len(created.rules))
}

// TestDecodedTokenToString checks that DecodeToken maps every char ID to its
// char and concatenates the chars in order.
func TestDecodedTokenToString(t *testing.T) {
	alphabet := map[TokenID]rune{
		1: 'a',
		2: 'b',
		3: 'c',
	}
	encoded := EncodedToken{1, 2, 1, 3, 3}

	decoded, err := DecodeToken(encoded, alphabet)

	require.NoError(t, err)
	require.Equal(t, "abacc", decoded)
}

// TestSpecialTokensToBin checks the big-endian byte layout produced for both
// positive and negative special token IDs.
func TestSpecialTokensToBin(t *testing.T) {
	input := specialTokens{
		unk: 1,
		pad: 259,
		bos: 2*256*256 + 37*256 + 2,
		eos: -256 * 256 * 256 * 127,
	}
	want := []byte{
		0, 0, 0, 1, // unk
		0, 0, 1, 3, // pad
		0, 2, 37, 2, // bos
		129, 0, 0, 0, // eos
	}
	got := specialTokensToBin(input)
	require.Equal(t, want, got)
}

func TestBinToSpecialTokens(t *testing.T) {
bytesArray := []byte{0, 0, 0, 1, 0, 0, 1, 3, 0, 2, 37, 2, 129, 0, 0, 0}
Comment thread
irinakhismatullina marked this conversation as resolved.
specials := specialTokens{1, 259, 2*256*256 + 37*256 + 2, -256 * 256 * 256 * 127}
require.Equal(t, specials, binToSpecialTokens(bytesArray))
}

// TestRuleToBin checks the big-endian byte layout produced for a merge rule.
func TestRuleToBin(t *testing.T) {
	input := rule{left: 1, right: 2, result: 257}
	want := []byte{
		0, 0, 0, 1, // left
		0, 0, 0, 2, // right
		0, 0, 1, 1, // result
	}
	got := ruleToBin(input)
	require.Equal(t, want, got)
}

// TestBinToRule checks that a 12-byte big-endian record is decoded into the
// expected merge rule.
func TestBinToRule(t *testing.T) {
	encoded := []byte{
		0, 0, 0, 1, // left
		0, 0, 0, 2, // right
		0, 0, 1, 1, // result
	}
	want := rule{left: 1, right: 2, result: 257}
	got := binToRule(encoded)
	require.Equal(t, want, got)
}

func TestReadModelFromBinary(t *testing.T) {
reader := bytes.NewReader([]byte{0, 0, 0, 5, 0, 0, 0, 4,
0, 0, 0, 99, 0, 0, 0, 6,
0, 0, 0, 98, 0, 0, 0, 7,
0, 0, 0, 95, 0, 0, 0, 4,
0, 0, 0, 100, 0, 0, 0, 5,
0, 0, 0, 97, 0, 0, 0, 8,
0, 0, 0, 4, 0, 0, 0, 8, 0, 0, 0, 9,
0, 0, 0, 4, 0, 0, 0, 6, 0, 0, 0, 10,
0, 0, 0, 4, 0, 0, 0, 5, 0, 0, 0, 11,
0, 0, 0, 4, 0, 0, 0, 7, 0, 0, 0, 12,
0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 3})
expected := Model{
map[rune]TokenID{97: 8, 98: 7, 99: 6, 100: 5, 95: 4},
map[TokenID]rune{4: 95, 5: 100, 6: 99, 7: 98, 8: 97},
[]rule{{4, 8, 9}, {4, 6, 10}, {4, 5, 11}, {4, 7, 12}},
map[TokenID]EncodedToken{4: {4}, 5: {5}, 6: {6}, 7: {7}, 8: {8}, 9: {4, 8}, 10: {4, 6}, 11: {4, 5}, 12: {4, 7}},
map[string]TokenID{"a": 8, "b": 7, "c": 6, "d": 5, "_": 4,
"_a": 9, "_b": 12, "_c": 10, "_d": 11},
specialTokens{1, 0, 2, 3},
}
Comment thread
irinakhismatullina marked this conversation as resolved.
model, err := ReadModelFromBinary(reader)
require.NoError(t, err)
require.Equal(t, expected, *model)
}
5 changes: 5 additions & 0 deletions go.mod
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
module github.com/src-d/go-YouTokenToMe

go 1.12

require (
github.com/sirupsen/logrus v1.4.2
github.com/stretchr/testify v1.4.0
)
Empty file removed go.sum
Empty file.
7 changes: 0 additions & 7 deletions main.go

This file was deleted.