add vendor

This commit is contained in:
Malar Invention
2022-04-03 09:37:16 +05:30
parent f96ba5f172
commit 00ebcd295e
2339 changed files with 705854 additions and 0 deletions

View File

@ -0,0 +1,30 @@
package textseg
import (
"bufio"
"bytes"
)
// AllTokens is a utility that uses a bufio.SplitFunc to produce a slice of
// all of the recognized tokens in the given buffer.
func AllTokens(buf []byte, splitFunc bufio.SplitFunc) ([][]byte, error) {
scanner := bufio.NewScanner(bytes.NewReader(buf))
scanner.Split(splitFunc)
var ret [][]byte
for scanner.Scan() {
ret = append(ret, scanner.Bytes())
}
return ret, scanner.Err()
}
// TokenCount is a utility that uses a bufio.SplitFunc to count the number of
// recognized tokens in the given buffer.
func TokenCount(buf []byte, splitFunc bufio.SplitFunc) (int, error) {
scanner := bufio.NewScanner(bytes.NewReader(buf))
scanner.Split(splitFunc)
var ret int
for scanner.Scan() {
ret++
}
return ret, scanner.Err()
}

View File

@ -0,0 +1,7 @@
package textseg
//go:generate go run make_tables.go -output tables.go
//go:generate go run make_test_tables.go -output tables_test.go
//go:generate ruby unicode2ragel.rb --url=http://www.unicode.org/Public/9.0.0/ucd/auxiliary/GraphemeBreakProperty.txt -m GraphemeCluster -p "Prepend,CR,LF,Control,Extend,Regional_Indicator,SpacingMark,L,V,T,LV,LVT,E_Base,E_Modifier,ZWJ,Glue_After_Zwj,E_Base_GAZ" -o grapheme_clusters_table.rl
//go:generate ragel -Z grapheme_clusters.rl
//go:generate gofmt -w grapheme_clusters.go

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,132 @@
package textseg
import (
"errors"
"unicode/utf8"
)
// Generated from grapheme_clusters.rl. DO NOT EDIT
%%{
# (except you are actually in grapheme_clusters.rl here, so edit away!)
machine graphclust;
write data;
}%%
var Error = errors.New("invalid UTF8 text")
// ScanGraphemeClusters is a split function for bufio.Scanner that splits
// on grapheme cluster boundaries.
func ScanGraphemeClusters(data []byte, atEOF bool) (int, []byte, error) {
if len(data) == 0 {
return 0, nil, nil
}
// Ragel state
cs := 0 // Current State
p := 0 // "Pointer" into data
pe := len(data) // End-of-data "pointer"
ts := 0
te := 0
act := 0
eof := pe
// Make Go compiler happy
_ = ts
_ = te
_ = act
_ = eof
startPos := 0
endPos := 0
%%{
include GraphemeCluster "grapheme_clusters_table.rl";
action start {
startPos = p
}
action end {
endPos = p
}
action emit {
return endPos+1, data[startPos:endPos+1], nil
}
ZWJGlue = ZWJ (Glue_After_Zwj | E_Base_GAZ Extend* E_Modifier?)?;
AnyExtender = Extend | ZWJGlue | SpacingMark;
Extension = AnyExtender*;
ReplacementChar = (0xEF 0xBF 0xBD);
CRLFSeq = CR LF;
ControlSeq = Control | ReplacementChar;
HangulSeq = (
L+ (((LV? V+ | LVT) T*)?|LV?) |
LV V* T* |
V+ T* |
LVT T* |
T+
) Extension;
EmojiSeq = (E_Base | E_Base_GAZ) Extend* E_Modifier? Extension;
ZWJSeq = ZWJGlue Extension;
EmojiFlagSeq = Regional_Indicator Regional_Indicator? Extension;
UTF8Cont = 0x80 .. 0xBF;
AnyUTF8 = (
0x00..0x7F |
0xC0..0xDF . UTF8Cont |
0xE0..0xEF . UTF8Cont . UTF8Cont |
0xF0..0xF7 . UTF8Cont . UTF8Cont . UTF8Cont
);
# OtherSeq is any character that isn't at the start of one of the extended sequences above, followed by extension
OtherSeq = (AnyUTF8 - (CR|LF|Control|ReplacementChar|L|LV|V|LVT|T|E_Base|E_Base_GAZ|ZWJ|Regional_Indicator|Prepend)) Extension;
# PrependSeq is prepend followed by any of the other patterns above, except control characters which explicitly break
PrependSeq = Prepend+ (HangulSeq|EmojiSeq|ZWJSeq|EmojiFlagSeq|OtherSeq)?;
CRLFTok = CRLFSeq >start @end;
ControlTok = ControlSeq >start @end;
HangulTok = HangulSeq >start @end;
EmojiTok = EmojiSeq >start @end;
ZWJTok = ZWJSeq >start @end;
EmojiFlagTok = EmojiFlagSeq >start @end;
OtherTok = OtherSeq >start @end;
PrependTok = PrependSeq >start @end;
main := |*
CRLFTok => emit;
ControlTok => emit;
HangulTok => emit;
EmojiTok => emit;
ZWJTok => emit;
EmojiFlagTok => emit;
PrependTok => emit;
OtherTok => emit;
# any single valid UTF-8 character would also be valid per spec,
# but we'll handle that separately after the loop so we can deal
# with requesting more bytes if we're not at EOF.
*|;
write init;
write exec;
}%%
// If we fall out here then we were unable to complete a sequence.
// If we weren't able to complete a sequence then either we've
// reached the end of a partial buffer (so there's more data to come)
// or we have an isolated symbol that would normally be part of a
// grapheme cluster but has appeared in isolation here.
if !atEOF {
// Request more
return 0, nil, nil
}
// Just take the first UTF-8 sequence and return that.
_, seqLen := utf8.DecodeRune(data)
return seqLen, data[:seqLen], nil
}

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,335 @@
#!/usr/bin/env ruby
#
# This scripted has been updated to accept more command-line arguments:
#
# -u, --url URL to process
# -m, --machine Machine name
# -p, --properties Properties to add to the machine
# -o, --output Write output to file
#
# Updated by: Marty Schoch <marty.schoch@gmail.com>
#
# This script uses the unicode spec to generate a Ragel state machine
# that recognizes unicode alphanumeric characters. It generates 5
# character classes: uupper, ulower, ualpha, udigit, and ualnum.
# Currently supported encodings are UTF-8 [default] and UCS-4.
#
# Usage: unicode2ragel.rb [options]
# -e, --encoding [ucs4 | utf8] Data encoding
# -h, --help Show this message
#
# This script was originally written as part of the Ferret search
# engine library.
#
# Author: Rakan El-Khalil <rakan@well.com>
require 'optparse'
require 'open-uri'
ENCODINGS = [ :utf8, :ucs4 ]
ALPHTYPES = { :utf8 => "byte", :ucs4 => "rune" }
DEFAULT_CHART_URL = "http://www.unicode.org/Public/5.1.0/ucd/DerivedCoreProperties.txt"
DEFAULT_MACHINE_NAME= "WChar"
###
# Display vars & default option
TOTAL_WIDTH = 80
RANGE_WIDTH = 23
@encoding = :utf8
@chart_url = DEFAULT_CHART_URL
machine_name = DEFAULT_MACHINE_NAME
properties = []
@output = $stdout
###
# Option parsing
cli_opts = OptionParser.new do |opts|
opts.on("-e", "--encoding [ucs4 | utf8]", "Data encoding") do |o|
@encoding = o.downcase.to_sym
end
opts.on("-h", "--help", "Show this message") do
puts opts
exit
end
opts.on("-u", "--url URL", "URL to process") do |o|
@chart_url = o
end
opts.on("-m", "--machine MACHINE_NAME", "Machine name") do |o|
machine_name = o
end
opts.on("-p", "--properties x,y,z", Array, "Properties to add to machine") do |o|
properties = o
end
opts.on("-o", "--output FILE", "output file") do |o|
@output = File.new(o, "w+")
end
end
cli_opts.parse(ARGV)
unless ENCODINGS.member? @encoding
puts "Invalid encoding: #{@encoding}"
puts cli_opts
exit
end
##
# Downloads the document at url and yields every alpha line's hex
# range and description.
def each_alpha( url, property )
open( url ) do |file|
file.each_line do |line|
next if line =~ /^#/;
next if line !~ /; #{property} #/;
range, description = line.split(/;/)
range.strip!
description.gsub!(/.*#/, '').strip!
if range =~ /\.\./
start, stop = range.split '..'
else start = stop = range
end
yield start.hex .. stop.hex, description
end
end
end
###
# Formats to hex at minimum width
def to_hex( n )
r = "%0X" % n
r = "0#{r}" unless (r.length % 2).zero?
r
end
###
# UCS4 is just a straight hex conversion of the unicode codepoint.
def to_ucs4( range )
rangestr = "0x" + to_hex(range.begin)
rangestr << "..0x" + to_hex(range.end) if range.begin != range.end
[ rangestr ]
end
##
# 0x00 - 0x7f -> 0zzzzzzz[7]
# 0x80 - 0x7ff -> 110yyyyy[5] 10zzzzzz[6]
# 0x800 - 0xffff -> 1110xxxx[4] 10yyyyyy[6] 10zzzzzz[6]
# 0x010000 - 0x10ffff -> 11110www[3] 10xxxxxx[6] 10yyyyyy[6] 10zzzzzz[6]
UTF8_BOUNDARIES = [0x7f, 0x7ff, 0xffff, 0x10ffff]
def to_utf8_enc( n )
r = 0
if n <= 0x7f
r = n
elsif n <= 0x7ff
y = 0xc0 | (n >> 6)
z = 0x80 | (n & 0x3f)
r = y << 8 | z
elsif n <= 0xffff
x = 0xe0 | (n >> 12)
y = 0x80 | (n >> 6) & 0x3f
z = 0x80 | n & 0x3f
r = x << 16 | y << 8 | z
elsif n <= 0x10ffff
w = 0xf0 | (n >> 18)
x = 0x80 | (n >> 12) & 0x3f
y = 0x80 | (n >> 6) & 0x3f
z = 0x80 | n & 0x3f
r = w << 24 | x << 16 | y << 8 | z
end
to_hex(r)
end
def from_utf8_enc( n )
n = n.hex
r = 0
if n <= 0x7f
r = n
elsif n <= 0xdfff
y = (n >> 8) & 0x1f
z = n & 0x3f
r = y << 6 | z
elsif n <= 0xefffff
x = (n >> 16) & 0x0f
y = (n >> 8) & 0x3f
z = n & 0x3f
r = x << 10 | y << 6 | z
elsif n <= 0xf7ffffff
w = (n >> 24) & 0x07
x = (n >> 16) & 0x3f
y = (n >> 8) & 0x3f
z = n & 0x3f
r = w << 18 | x << 12 | y << 6 | z
end
r
end
###
# Given a range, splits it up into ranges that can be continuously
# encoded into utf8. Eg: 0x00 .. 0xff => [0x00..0x7f, 0x80..0xff]
# This is not strictly needed since the current [5.1] unicode standard
# doesn't have ranges that straddle utf8 boundaries. This is included
# for completeness as there is no telling if that will ever change.
def utf8_ranges( range )
ranges = []
UTF8_BOUNDARIES.each do |max|
if range.begin <= max
if range.end <= max
ranges << range
return ranges
end
ranges << (range.begin .. max)
range = (max + 1) .. range.end
end
end
ranges
end
def build_range( start, stop )
size = start.size/2
left = size - 1
return [""] if size < 1
a = start[0..1]
b = stop[0..1]
###
# Shared prefix
if a == b
return build_range(start[2..-1], stop[2..-1]).map do |elt|
"0x#{a} " + elt
end
end
###
# Unshared prefix, end of run
return ["0x#{a}..0x#{b} "] if left.zero?
###
# Unshared prefix, not end of run
# Range can be 0x123456..0x56789A
# Which is equivalent to:
# 0x123456 .. 0x12FFFF
# 0x130000 .. 0x55FFFF
# 0x560000 .. 0x56789A
ret = []
ret << build_range(start, a + "FF" * left)
###
# Only generate middle range if need be.
if a.hex+1 != b.hex
max = to_hex(b.hex - 1)
max = "FF" if b == "FF"
ret << "0x#{to_hex(a.hex+1)}..0x#{max} " + "0x00..0xFF " * left
end
###
# Don't generate last range if it is covered by first range
ret << build_range(b + "00" * left, stop) unless b == "FF"
ret.flatten!
end
def to_utf8( range )
utf8_ranges( range ).map do |r|
begin_enc = to_utf8_enc(r.begin)
end_enc = to_utf8_enc(r.end)
build_range begin_enc, end_enc
end.flatten!
end
##
# Perform a 3-way comparison of the number of codepoints advertised by
# the unicode spec for the given range, the originally parsed range,
# and the resulting utf8 encoded range.
def count_codepoints( code )
code.split(' ').inject(1) do |acc, elt|
if elt =~ /0x(.+)\.\.0x(.+)/
if @encoding == :utf8
acc * (from_utf8_enc($2) - from_utf8_enc($1) + 1)
else
acc * ($2.hex - $1.hex + 1)
end
else
acc
end
end
end
def is_valid?( range, desc, codes )
spec_count = 1
spec_count = $1.to_i if desc =~ /\[(\d+)\]/
range_count = range.end - range.begin + 1
sum = codes.inject(0) { |acc, elt| acc + count_codepoints(elt) }
sum == spec_count and sum == range_count
end
##
# Generate the state maching to stdout
def generate_machine( name, property )
pipe = " "
@output.puts " #{name} = "
each_alpha( @chart_url, property ) do |range, desc|
codes = (@encoding == :ucs4) ? to_ucs4(range) : to_utf8(range)
#raise "Invalid encoding of range #{range}: #{codes.inspect}" unless
# is_valid? range, desc, codes
range_width = codes.map { |a| a.size }.max
range_width = RANGE_WIDTH if range_width < RANGE_WIDTH
desc_width = TOTAL_WIDTH - RANGE_WIDTH - 11
desc_width -= (range_width - RANGE_WIDTH) if range_width > RANGE_WIDTH
if desc.size > desc_width
desc = desc[0..desc_width - 4] + "..."
end
codes.each_with_index do |r, idx|
desc = "" unless idx.zero?
code = "%-#{range_width}s" % r
@output.puts " #{pipe} #{code} ##{desc}"
pipe = "|"
end
end
@output.puts " ;"
@output.puts ""
end
@output.puts <<EOF
# The following Ragel file was autogenerated with #{$0}
# from: #{@chart_url}
#
# It defines #{properties}.
#
# To use this, make sure that your alphtype is set to #{ALPHTYPES[@encoding]},
# and that your input is in #{@encoding}.
%%{
machine #{machine_name};
EOF
properties.each { |x| generate_machine( x, x ) }
@output.puts <<EOF
}%%
EOF

View File

@ -0,0 +1,19 @@
package textseg
import "unicode/utf8"
// ScanGraphemeClusters is a split function for bufio.Scanner that splits
// on UTF8 sequence boundaries.
//
// This is included largely for completeness, since this behavior is already
// built in to Go when ranging over a string.
func ScanUTF8Sequences(data []byte, atEOF bool) (int, []byte, error) {
if len(data) == 0 {
return 0, nil, nil
}
r, seqLen := utf8.DecodeRune(data)
if r == utf8.RuneError && !atEOF {
return 0, nil, nil
}
return seqLen, data[:seqLen], nil
}