diff options
Diffstat (limited to 'src/gentables')
-rwxr-xr-x | src/gentables | 307 |
1 files changed, 307 insertions, 0 deletions
diff --git a/src/gentables b/src/gentables new file mode 100755 index 0000000..262169d --- /dev/null +++ b/src/gentables @@ -0,0 +1,307 @@ +#!/opt/bin/perl + +# the generated tables mostly have NOT been checked so far! + +use v5.8.2; +use utf8; +use Encode; +use Encode::JP; +use Encode::CN; +use Encode::KR; +use Encode::HanExtra; +use Encode::JIS2K; + +my $gen; + +$TO_UNICODE = 0; # also generate to_unicode tables + +sub linear { + my ($a, $l, $h, $b) = @_; + for ($l .. $h) { + return unless defined $a->[$_] && $a->[$_] == $_ + ($b - $l); + } + 1; +} + +sub wrap { + my $res = " "; + $res .= "$1\n " while $_[0] =~ /\G(.{90}\S*?)\s+/gc; + $res .= "$1" if $_[0] =~ /\G(.*)$/; + $res; +} + +my $last_tab_full; # hack + +sub gentab { + my ($enc, $l, $h, $f, $rep) = @_; + $last_tab_full = 0; + + " = {\n" + . (wrap join ", ", + map +(sprintf "0x$f", + defined $enc->[$_] ? $enc->[$_] : $last_tab_full++ * 0 + $rep + ), $l..$h) + . "\n};\n"; +} + +sub gen { + my ($enc, $base, $ch) = @_; + + my ($fun, $tab); + my (@t1, @t2); + + for (255, 159, 127, 126, 125) { + if (linear $enc, 0, $_, 0) { + undef $enc->[$_] for 0..$_; + $fun .= sprintf "if (· <= 0x%04x) return ·;\n", $_; + } + } + + for (126, 127, 128) { + if (linear $enc, $_, 159, $_) { + undef $enc->[$_] for $_..159; + $fun .= sprintf "if (0x%04x <= · && · <= 0x%04x) return ·;\n", $_, 159; + } + } + + for (126, 127) { + next unless defined $enc->[$_]; + $fun .= sprintf "if (· == 0x%04x) return 0x%04x;\n", $_, delete $enc->[$_]; + } + + my @map; + my @map2; + + for (0 .. $#$enc) { + if (defined $enc->[$_]) { + $map[$_] = $enc->[$_]; + $map2[$_ >> 8]++; + } + } + + for my $p (0..255) { + if ($map2[$p]) { + my $b = $p << 8; + + my ($l, $h); + for my $i (0..255) { $l = $i, last if defined $map[$b + $i]; } + for my $i (0..255) { $h = 255 - $i, last if defined $map[$b + 255 - $i]; } + + if ($map2[$p] <= 5) { + for ($l .. $h) { + next unless defined $enc->[$b + $_]; + $fun .= sprintf "if (· == 0x%04x) return 0x%04x;\n", $b + $_, $enc->[$b + $_]; + } + } elsif (linear $enc, $b + $l, $b + $h, $enc->[$b + $l]) { + my $dif = $enc->[$b + $l] - ($b + $l); + $dif = $dif < 0 ? sprintf "- 0x%04x", -$dif : sprintf "+ 0x%04x", $dif; + $fun .= sprintf "if (· >= 0x%04x && 0x%04x >= ·) return · %s;\n", $b + $l, $b + $h, $dif; + } elsif ($map2[$p] <= 5) { # defunct + $fun .= "switch (·)\n {\n"; + for ($l .. $h) { + next unless defined $enc->[$b + $_]; + $fun .= sprintf " case 0x%04x: return 0x%04x;\n", $b + $_, $enc->[$b + $_]; + } + $fun .= " }\n"; + } else { + my ($i, $t, $f) = (0, "uint8_t", "%02x"); + for ($l .. $h) { + if ($enc->[$b + $_] > 255) { + ($i, $t, $f) = (1, "uint16_t", "%04x"); + last; + } + } + + $i ? push @t2, [$p, $l, $h, $t, $f] + : push @t1, [$b, $l, $h, $t, $f]; + } + + } + } + + if (@t2 <= 3) { + push @t1, @t2; + @t2 = (); + } + + for (@t1) { + my ($b, $l, $h, $t, $f) = @$_; + my $rep = $b + $l == 0 ? 128 : 0; + $tab .= "static const $t $base\_$b\[]" . gentab $enc, $b + $l, $b + $h, $f; + $fun .= sprintf "if (0x%04x <= · && · <= 0x%04x)\n" + . " return %s$base\_$b\[· - 0x%04x];\n", + $b + $l, $b + $h, + ($last_tab_full ? sprintf "$base\_$b\[· - 0x%04x] == $rep ? NOCHAR : ", $b + $l : ""), + $b + $l; + } + + if (@t2) { + my ($min, $max) = (255, 0); + my ($l, $h) = (255, 0); + for (@t2) { + my ($p, $L, $H, $t, $f) = @$_; + $max = $p if $p > $max; + $min = $p if $p < $min; + $l = $L if $L < $l; + $h = $H if $H > $h; + } + + $fun .= "uint8_t l = ·;\n"; + $fun .= "uint16_t h = · >> 8;\n"; + + if ($h - $l < 200) { + my $d = $h - $l + 1; + + my @enc; + for $p ($min .. $max) { + for $i ($l .. $h) { + $enc[($p - $min) * $d + $i - $l] = $enc->[$p * 256 + $i]; + } + } + $tab .= "static const uint16_t $base\_m[]" . gentab \@enc, 0, $#enc, "%04x"; + + if ($last_tab_full) { + $fun .= sprintf "if (0x%02x <= h && h <= 0x%02x\n" + . " && 0x%02x <= l && l <= 0x%02x)\n" + . " return $base\_m\[h * 0x%02x + l - 0x%04x]\n" + . " ? $base\_m\[h * 0x%02x + l - 0x%04x]\n" + . " : NOCHAR;\n", + $min, $max, $l, $h, + ($d, $min * $d + $l) x 2; + } else { + $fun .= sprintf "if (0x%02x <= h && h <= 0x%02x\n" + . " && 0x%02x <= l && l <= 0x%02x)\n" + . " return $base\_m\[h * 0x%02x + l - 0x%04x];\n", + $min, $max, $l, $h, + $d, + $min * $d + $l; + } + } else { + my @tab = (0) x ($max - $min); + for (@t2) { + my ($p, undef, undef, $t, $f) = @$_; + $tab .= "static const $t $base\_$p\[]" . gentab $enc, $p * 256 + $l, $p * 256 + $h, $f, 0; + $tab[$p - $min] = "$base\_$p"; + } + + $tab .= "const uint16_t *$base\_i[] = {\n" + . (wrap join ", ", @tab) + . "\n};\n\n"; + + $fun .= sprintf "if (0x%02x <= h && h <= 0x%02x\n" + . " && 0x%02x <= l && l <= 0x%02x\n" + . " && $base\_i[h - 0x%02x])\n" + . " return $base\_i\[h - 0x%02x][l - 0x%02x]\n" + . " ? $base\_i\[h - 0x%02x][l - 0x%02x]\n" + . " : NOCHAR;\n", + $min, $max, $l, $h, + $min, ($min, $l) x 2; + } + } + + $fun .= "return NOCHAR;\n"; + + $fun =~ s/·/$ch/g; + + ($tab, $fun); +} + +while (<DATA>) { + my ($group, $base, $cs, $type) = split /\s+/; + + my @enc1; + my @enc2; + + for (0 .. 65535) { + my $enc = encode $cs, (chr $_), Encode::FB_QUIET; + + if (length $enc) { + my $code = hex unpack "H*", $enc; + $enc1[$_] = $code; + $enc2[$code] = $_ unless defined $enc2[$code]; + } + } + + my ($tab1, $fun1); + my ($tab2, $fun2); + + ($tab1, $fun1) = gen \@enc1, "$base\_f", "unicode"; + ($tab2, $fun2) = gen \@enc2, "$base\_t", "enc" if $TO_UNICODE; + + $fun1 =~ s/^/ /gm; + $fun2 =~ s/^/ /gm; + + print "$base\n"; + open OUT, ">table/$base.h" or die; + + print OUT <<EOF; +// +// AUTOMATICALLLY GENERATED by gentables +// +#ifdef ENCODING_$group + +$tab1$tab2 +static uint32_t cs_$base\_from_unicode (unicode_t unicode) { +$fun1} +EOF + +if ($TO_UNICODE) { + print OUT <<EOF +#if ENCODING_TO_UNICODE +static unicode_t cs_$base\_to_unicode (uint32_t enc) { +$fun2} +#endif +EOF +} + +print OUT <<EOF; + +#else + +#define cs_$base\_from_unicode cs_unknown_from_unicode +#define cs_$base\_to_unicode cs_unknown_to_unicode + +#endif +EOF + + close OUT; +} + +__DATA__ +VN viscii viscii +KR ksc5601_1987_0 ksc5601-raw +ZH gb2312_1980_0 gb2312-raw +ZH gbk_0 gbk +ZH_EXT cns11643_1992_1 cns11643-1 +ZH_EXT cns11643_1992_2 cns11643-2 +ZH_EXT cns11643_1992_3 cns11643-3 +ZH_EXT cns11643_1992_4 cns11643-4 +ZH_EXT cns11643_1992_5 cns11643-5 +ZH_EXT cns11643_1992_6 cns11643-6 +ZH_EXT cns11643_1992_7 cns11643-7 +ZH_EXT cns11643_1992_f cns11643-f +ZH big5 big5 +ZH_EXT big5_ext big5ext +ZH_EXT big5_plus big5plus +EU koi8_r koi8-r +EU koi8_u koi8-u +DEFAULT iso8859_1 iso-8859-1 +EU iso8859_2 iso-8859-2 +EU iso8859_3 iso-8859-3 +EU iso8859_4 iso-8859-4 +EU iso8859_5 iso-8859-5 +EU iso8859_6 iso-8859-6 +EU iso8859_7 iso-8859-7 +EU iso8859_8 iso-8859-8 +EU iso8859_9 iso-8859-9 +EU iso8859_10 iso-8859-10 +EU iso8859_11 iso-8859-11 +EU iso8859_13 iso-8859-13 +EU iso8859_14 iso-8859-14 +DEFAULT iso8859_15 iso-8859-15 +EU iso8859_16 iso-8859-16 +JP jis0201_1976_0 jis0201-raw +JP jis0208_1990_0 jis0208-raw +JP jis0212_1990_0 jis0212-raw +JP_EXT jis0213_1 jis0213-1-raw +JP_EXT jis0213_2 jis0213-2-raw |