#!/usr/bin/env perl
# $Id: script-bad-utf8.html,v 1.2 2020/08/17 17:44:19 tom Exp $
# -----------------------------------------------------------------------------
# Copyright 2020 by Thomas E. Dickey
#
# All Rights Reserved
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the
# "Software"), to deal in the Software without restriction, including
# without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of the Software, and to
# permit persons to whom the Software is furnished to do so, subject to
# the following conditions:
#
# The above copyright notice and this permission notice shall be included
# in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
# IN NO EVENT SHALL THE ABOVE LISTED COPYRIGHT HOLDER(S) BE LIABLE FOR ANY
# CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
#
# Except as contained in this notice, the name(s) of the above copyright
# holders shall not be used in advertising or otherwise to promote the
# sale, use or other dealings in this Software without prior written
# authorization.
# -----------------------------------------------------------------------------
# Use the cursor-position response to make a copy of Markus Kuhn's sample of
# bad UTF-8 which is adjusted for a given terminal. Also, update a CSV file
# which shows the number of differences against the target file.
#
# NOTE: when running this script, it helps to first reset the terminal, since
# most terminal emulators other than xterm have fragile/stateful error handling,
# and will produce inconsistent results.
#
# Further reading
# http://unicode.org/mail-arch/unicode-ml/Archives-Old/UML015/0145.html
# "UTF-8 stress test"
# by Markus Kuhn
#
# https://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html
# https://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt
# "UTF-8 decoder capability and stress test"
# by Markus Kuhn
#
# https://hsivonen.fi/broken-utf-8/
# "How Many REPLACEMENT CHARACTERs?"
# by Henri Sivonen
#
# http://www.unicode.org/L2/L2019/19192-review-docs.pdf
# "Review of Unicode 2018 Henri Sivonen docs"
# by Markus Scherer
# TODO: sort cases as version-strings
use strict;
use warnings;
use Term::ReadKey;
use Getopt::Std;
use Text::CSV;
our ( $opt_d, $opt_i, $opt_m, $opt_n, $opt_q, $opt_r, $opt_s, $opt_t );
our $destdir = "/users/ftp/httpdocs/xterm/bad-utf8";
our $rootname = "UTF-8-test";
our $sourcefile = "$rootname.txt";
our @sourcefile;
our $tty;
our $crlf = "\r\n";
# This is a hash of testcases into the data for the rows.
# The data for each row hashes column header (terminal name) to adjustments.
our %csv_file;
our ( $oldx, $oldy );
our ( $maxx, $maxy );
sub abspath($) {
my $path = shift;
$path = "$destdir/$path" unless ( $path =~ /^(\.){0,2}\// );
return $path;
}
sub filesize($) {
my $filename = shift;
my (
$dev, $ino, $mode, $nlink, $uid, $gid, $rdev,
$size, $atime, $mtime, $ctime, $blksize, $blocks
) = stat($filename);
return $size;
}
sub readfile($) {
my $filename = shift;
my $encoding = ":raw :bytes";
open( my $fh, "< $encoding", $filename ) || do {
print STDERR "Can't open $filename: $!\n";
exit;
};
binmode($fh);
read $fh, my $bytes, &filesize($filename);
close($fh);
@sourcefile = split /\n/, $bytes;
}
sub writefile($$) {
my $filename = $_[0];
my @data = @{ $_[1] };
my $encoding = ":raw :bytes";
open( my $fh, "> $encoding", $filename ) || do {
print STDERR "Can't open $filename: $!\n";
exit;
};
binmode($fh);
use bytes;
for my $n ( 0 .. $#data ) {
print $fh $data[$n] . "\n";
}
no bytes;
close($fh);
}
sub get_reply($) {
my $command = $_[0];
my $reply = "";
syswrite $tty, $command;
while (1) {
my $test = ReadKey 0.10;
last if not defined $test;
$reply .= $test;
}
return $reply;
}
sub move($$) {
my $y = $_[0] + 1;
my $x = $_[1] + 1;
syswrite $tty, "\033[$y;$x" . "H";
}
sub clear() {
syswrite $tty, "\033[J";
}
sub whereami() {
my @coords = ( 0, 0 );
my $reply = &get_reply("\033[6n");
my $y = 0;
my $x = 0;
if ( $reply =~ /^\033\[\d+;\d+R$/ ) {
$reply =~ s/^\033\[//;
$reply =~ s/R//;
my @coords = split /;/, $reply;
$y = $coords[0];
$x = $coords[1];
}
return ( $y, $x );
}
sub newline() {
if ($opt_q) {
&move( $oldy - 1, $oldx - 1 );
&clear;
}
else {
syswrite $tty, $crlf;
}
}
sub csv_file() {
return &abspath("$rootname.csv");
}
sub read_csv() {
my $csv = Text::CSV->new( { binary => 1 } ) # should set binary attribute.
or die "Cannot use CSV: " . Text::CSV->error_diag();
if ( open my $fh, "<", &csv_file ) {
my $first = 1;
my @head;
while ( my $row = $csv->getline($fh) ) {
if ($first) {
@head = @$row;
$first = 0;
}
else {
my @data = @$row;
next if ( $data[0] !~ /^\d+(\.\d+)*$/ );
my %data;
for my $c ( 0 .. $#data ) {
$data{ $head[$c] } = $data[$c];
}
$csv_file{ $data[0] } = \%data;
}
}
$csv->eof or $csv->error_diag();
close $fh;
}
}
sub write_csv() {
my $csv = Text::CSV->new( { binary => 1 } ) # should set binary attribute.
or die "Cannot use CSV: " . Text::CSV->error_diag();
if ( open my $fh, ">", &csv_file ) {
my $first = 1;
my @head;
my @sums;
for my $data ( sort keys %csv_file ) {
my %data = %{ $csv_file{$data} };
if ($first) {
$first = 0;
# "(tests)" will sort on the left
@head = ( sort keys %data );
$csv->print( $fh, $_ ) for \@head;
print $fh "\n";
}
my @data;
for my $h ( 0 .. $#head ) {
$data[$h] = $data{ $head[$h] };
$sums[$h] += $data{ $head[$h] } if ( $h > 0 );
}
$csv->print( $fh, $_ ) for \@data;
print $fh "\n";
}
$sums[0] = "totals";
$csv->print( $fh, $_ ) for \@sums;
print $fh "\n";
close $fh;
}
}
sub analyze($) {
my $target = shift;
my @targetfile;
my %mismatches;
my $GOAL = 80;
my $quit = 0;
open $tty, ">", "/dev/tty" or die "Cannot open /dev/tty ";
binmode($tty);
ReadMode 'ultra-raw';
( $oldy, $oldx ) = &whereami;
&move( 9999, 9999 );
( $maxy, $maxx ) = &whereami;
&move( $oldy - 1, $oldx - 1 );
my $testcase = "";
for my $row ( 0 .. $#sourcefile ) {
$targetfile[$row] = $sourcefile[$row];
if ( $sourcefile[$row] =~ /^\d+(\.\d+){0,2}\b/ ) {
$testcase = $sourcefile[$row];
$testcase =~ s/[^\d\.].*$//;
$mismatches{$testcase} = 0;
}
syswrite $tty, $sourcefile[$row];
use bytes;
my $actual = length( $sourcefile[$row] );
my $marker = index( $sourcefile[$row], "|" );
if ( $actual <= 0 or $marker < $actual - 1 ) {
if ( $actual == 79
and substr( $sourcefile[$row], $actual - 1 ) eq " "
and ( $testcase eq "2.1.1" or $testcase eq "2.2.1" ) )
{
# Markus' first version in September 1999 was okay, but he
# omitted the marker on all subsequent versions: the last
# character is a blank rather than the vertical bar. Here is a
# simple workaround.
$sourcefile[$row] =
substr( $sourcefile[$row], 0, $actual - 1 ) . "|";
$targetfile[$row] = $sourcefile[$row];
}
else {
&newline;
next;
}
}
my ( $nowy, $nowx ) = &whereami;
# try to handle wrapping, but best results are on a wide terminal
$nowx += $maxx if ( $nowx < ( $maxx / 4 ) );
syswrite $tty, "$nowx:$actual" if ($opt_d);
if ( $nowx != $GOAL ) {
if ($opt_m) {
my $mods = ( $GOAL - $nowx );
$mods = -$mods if ( $mods < 0 );
$mismatches{$testcase} += $mods;
}
else {
$mismatches{$testcase}++;
}
&newline;
while ( length( $targetfile[$row] ) < $GOAL ) {
my $part =
substr( $targetfile[$row], 0,
length( $targetfile[$row] ) - 1 )
. " |";
$targetfile[$row] = $part;
}
my $endbar = "";
my $aligns = "|";
my $limits = length( $targetfile[$row] );
while ( length($aligns) < $limits ) {
my $check =
substr( $targetfile[$row], $limits - length($aligns) );
last if ( $check ne $aligns );
$endbar = $aligns;
$aligns = " " . $aligns;
}
my $have = length($endbar);
if ( $nowx > $GOAL ) {
my $want = 1 + ( $nowx - $GOAL );
my $diff = 0;
if ( $want > $have ) {
$want = $have;
}
$targetfile[$row] =
substr( $targetfile[$row], 0, $actual - $want ) . "|";
}
else {
my $want = ( $GOAL - $nowx );
my $diff = 0;
$diff = $want - 1;
$want = 1;
my $spaces = sprintf( "%*s|", $diff + $want, " " );
$targetfile[$row] =
substr( $targetfile[$row], 0, $actual - $want ) . $spaces;
}
if ($opt_r) {
&move( $nowy - 2, 0 ) if ( $nowy == $maxy );
&move( $nowy - 1, 0 ) if ( $nowy != $maxy );
&clear;
}
syswrite $tty, $targetfile[$row];
syswrite $tty, "NEW" if ($opt_d);
}
no bytes;
if ($opt_i) {
my $key = ReadKey 0;
if ( $key eq "q" ) {
$quit = 1;
last;
}
&newline;
}
else {
&newline;
}
}
ReadMode 'restore';
syswrite $tty, $crlf;
return if ($quit);
# Record the adjustments
my $targetfile = &abspath("$rootname:$target.txt");
&writefile( $targetfile, \@targetfile );
# Record the information to allow comparing different terminals
&read_csv;
for my $testcase ( keys %mismatches ) {
my %data;
%data = %{ $csv_file{$testcase} } if ( defined $csv_file{$testcase} );
$data{"(tests)"} = $testcase;
$data{$target} = $mismatches{$testcase};
$csv_file{$testcase} = \%data;
}
&write_csv;
}
sub main::HELP_MESSAGE() {
printf STDERR <<EOF
Usage: $0 [options] target
Options:
-d debugging shows details in margin (needs wide terminal)
-i interactive, wait after each line (q exits)
-m show the amount of modification rather than the number of failures
-n do not update the target file
-q quiet (do not scroll; just process on a single line)
-r replace current line rather than adding changed-line
-s source use a different source than
$sourcefile
-t dir use different directory for source/analysis files than
$destdir
EOF
;
exit;
}
$Getopt::Std::STANDARD_HELP_VERSION = 1;
&getopts('dimnqrs:t:') || &main::HELP_MESSAGE;
&main::HELP_MESSAGE if ( $opt_q and $opt_r );
$destdir = $opt_t if ($opt_t);
$sourcefile = $opt_s if ($opt_s);
$sourcefile = &abspath($sourcefile);
$rootname = $sourcefile;
$rootname =~ s,\.[^./]*$,,;
$rootname =~ s,^.*/,,;
$rootname .= "-m" if ($opt_m);
&readfile($sourcefile);
if ( -t 0 and -t 2 ) {
printf "** Interactive$crlf";
autoflush STDOUT 1;
&main::HELP_MESSAGE unless ( $#ARGV == 0 );
&analyze( $ARGV[0] );
}
else {
printf "** Non-Interactive\n";
for my $n ( 0 .. $#sourcefile ) {
printf( "%d:%d:%s\n",
$n + 1, length( $sourcefile[$n] ),
$sourcefile[$n] );
}
}
1;