#!/usr/bin/env perl
# $Id: script-diff-utf8.html,v 1.2 2020/08/17 17:44:19 tom Exp $
# -----------------------------------------------------------------------------
# Copyright 2020 by Thomas E. Dickey
#
# All Rights Reserved
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the
# "Software"), to deal in the Software without restriction, including
# without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of the Software, and to
# permit persons to whom the Software is furnished to do so, subject to
# the following conditions:
#
# The above copyright notice and this permission notice shall be included
# in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
# IN NO EVENT SHALL THE ABOVE LISTED COPYRIGHT HOLDER(S) BE LIABLE FOR ANY
# CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
#
# Except as contained in this notice, the name(s) of the above copyright
# holders shall not be used in advertising or otherwise to promote the
# sale, use or other dealings in this Software without prior written
# authorization.
# -----------------------------------------------------------------------------
# A directory of results from the bad-utf8 script will have a set of logfiles
# with the terminal-name after ":", e.g.,
# UTF-8-test:foobar.txt
#
# and (for completeness) a logfile generated using the "-m" option:
# UTF-8-test-m:foobar.txt
#
# If both are present, those should be identical. Both are collected since
# many terminal emulators have poor error-recovery, and must be reset before
# collecting logs showing how they respond to ill-formed UTF-8.
#
# This script ensures that they are identical, and then computes the number of
# lines different between each combination of terminal emulator. The resulting
# matrix can show which terminals have the same (but different) behavior from
# other groups of terminals.
use strict;
use warnings;
use Getopt::Std;
use Text::CSV;
our ( $opt_o, $opt_p, $opt_s, $opt_t );
our $destdir = "/users/ftp/httpdocs/xterm/bad-utf8";
our $rootname = "UTF-8-test";
our $sourcefile = "$rootname.txt";
our %terminals; # hash from terminal name to filename
sub failed($) {
my $msg = shift;
STDOUT->flush;
printf STDERR "? %s\n", $msg;
exit 1;
}
sub abspath($) {
my $path = shift;
$path = "$destdir/$path" unless ( $path =~ /^(\.){0,2}\// );
return $path;
}
sub filesize($) {
my $filename = shift;
my (
$dev, $ino, $mode, $nlink, $uid, $gid, $rdev,
$size, $atime, $mtime, $ctime, $blksize, $blocks
) = stat($filename);
return $size;
}
sub readfile($) {
my $filename = shift;
my $encoding = ":raw :bytes";
open( my $fh, "< $encoding", $filename ) || do {
print STDERR "Can't open $filename: $!\n";
exit;
};
binmode($fh);
read $fh, my $bytes, &filesize($filename);
close($fh);
my @result = split /\n/, $bytes;
return \@result;
}
sub section($$) {
my $line = shift;
my $part = shift;
my $result = $part;
if ( $line =~ /^\d(\.\d+)*\s/ ) {
$result = $line;
$result =~ s/\s.*//;
}
return $result;
}
# return hash of the line-numbers which differ between the two files
sub difference($$) {
my $ref = shift;
my $cmp = shift;
my %result;
my @ref = @{ &readfile($ref) };
my @cmp = @{ &readfile($cmp) };
my $max = $#ref;
$max = $#cmp if ( $#cmp < $max );
my $section = "";
for my $n ( 0 .. $max ) {
$section = §ion( $ref[$n], $section );
if ( $cmp[$n] ne $ref[$n] ) {
$result{$n} = $section;
}
}
if ( $#ref > $#cmp ) {
for my $n ( $#cmp .. $#ref ) {
$section = §ion( $cmp[$n], $section );
$result{$n} = $section;
}
}
elsif ( $#cmp > $#ref ) {
for my $n ( $#ref .. $#cmp ) {
$section = §ion( $ref[$n], $section );
$result{$n} = $section;
}
}
return \%result;
}
# return the number of lines which are different between the two files
sub compare($$) {
my $ref = shift;
my $cmp = shift;
my %diff = %{ &difference( $ref, $cmp ) };
my @diff = keys %diff;
return $#diff + 1;
}
# Show differences in plain-text
sub report_all_plain($) {
my $widest = shift;
my @keys = ( sort keys %terminals );
for my $row (@keys) {
printf "%-*s'", 1 + $widest, $row;
for my $col (@keys) {
printf "%3d", &compare( $terminals{$row}, $terminals{$col} );
}
printf "\n";
}
for my $col ( 0 .. 4 + $widest + $#keys * 3 ) {
printf "-";
}
printf "\n";
for my $row ( 0 .. $widest - 1 ) {
printf "%-*s'", 1 + $widest, " ";
for my $col ( 0 .. $#keys ) {
my $key = $keys[$col];
my $chr = " ";
$chr = substr( $key, $row, 1 ) if ( $row < length($key) );
printf " %s", $chr;
}
printf "\n";
}
}
# Show pairwise differences in plain-text
sub report_pair_plain() {
printf "** pairwise report\n";
my %groups;
my @terms = sort keys %terminals;
for my $level ( 0 .. $opt_p ) {
printf ".. level %d\n", $level;
for my $p ( 0 .. $#terms ) {
for my $q ( 0 .. $p - 1 ) {
my $r =
&compare( $terminals{ $terms[$p] },
$terminals{ $terms[$q] } );
if ( $r == $level ) {
printf "%s vs %s", $terms[$p], $terms[$q];
if ( $level != 0 ) {
printf " (";
my %r = %{
&difference(
$terminals{ $terms[$p] },
$terminals{ $terms[$q] }
)
};
my $mark = "";
for $r ( sort { $a <=> $b } keys %r ) {
printf "%s%s", $mark, $r{$r};
$mark = ", ";
}
printf ")";
}
printf "\n";
}
}
}
}
}
# Show differences in CSV
sub report_all_csv($) {
my $fp = $_[0];
my $widest = 12;
my @keys = ( sort keys %terminals );
printf $fp "(compare)";
for my $col (@keys) {
printf $fp ",%s", $col;
}
printf $fp "\n";
for my $row (@keys) {
printf $fp "%s", $row;
for my $col (@keys) {
printf $fp ",%d", &compare( $terminals{$row}, $terminals{$col} );
}
printf $fp "\n";
}
}
sub find_terminals() {
use bytes;
my $widest = 0;
if ( opendir( my $dh, $destdir ) ) {
my @entries = sort readdir($dh);
closedir $dh;
for my $n ( 0 .. $#entries ) {
next if ( $entries[$n] =~ /^\.\.?$/ );
next unless ( $entries[$n] =~ /:.*\.txt$/ );
my $path = sprintf( "%s/%s", $destdir, $entries[$n] );
next if ( -l $path );
next unless ( -f $path );
# printf "%s\n", $path;
my $testroot = $entries[$n];
$testroot =~ s,^.*/,,;
$testroot =~ s,(-[[:alpha:]])?:.*,,;
next unless ( $testroot eq $rootname );
my $terminal = $entries[$n];
$terminal =~ s/^.*://;
$terminal =~ s/\.txt$//;
if ( defined $terminals{$terminal} ) {
&failed("mismatch $terminal")
if ( &compare( $path, $terminals{$terminal} ) != 0 );
}
else {
$terminals{$terminal} = $path;
$widest = length($terminal) if ( length($terminal) > $widest );
}
}
}
if ($opt_o) {
my $fh;
open( $fh, ">", $opt_o ) or &failed("open $opt_o");
printf "** $opt_o\n";
&report_all_csv($fh);
close $fh;
}
else {
&report_all_plain($widest) unless ($opt_p);
&report_pair_plain($widest) if ($opt_p);
}
no bytes;
}
sub main::HELP_MESSAGE() {
printf STDERR <<EOF
Usage: $0 [options] target
Options:
-o output write table to output in CSV format
-p limit report groups that fit into pairwise match
-s source use a different source than
$sourcefile
-t dir use different directory for source/analysis files than
$destdir
EOF
;
exit;
}
$Getopt::Std::STANDARD_HELP_VERSION = 1;
&getopts('o:p:s:t:') || &main::HELP_MESSAGE;
if ($opt_p) {
&main::HELP_MESSAGE unless ( $opt_p =~ /^\d+$/ );
}
$destdir = $opt_t if ($opt_t);
$sourcefile = $opt_s if ($opt_s);
$sourcefile = &abspath($sourcefile);
$rootname = $sourcefile;
$rootname =~ s,\.[^./]*$,,;
$rootname =~ s,^.*/,,;
&find_terminals;
1;