#!/usr/bin/env perl
# $Id: script-diff-utf8.html,v 1.2 2020/08/17 17:44:19 tom Exp $
# -----------------------------------------------------------------------------
# Copyright 2020 by Thomas E. Dickey
#
#                         All Rights Reserved
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the
# "Software"), to deal in the Software without restriction, including
# without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of the Software, and to
# permit persons to whom the Software is furnished to do so, subject to
# the following conditions:
#
# The above copyright notice and this permission notice shall be included
# in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
# IN NO EVENT SHALL THE ABOVE LISTED COPYRIGHT HOLDER(S) BE LIABLE FOR ANY
# CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
#
# Except as contained in this notice, the name(s) of the above copyright
# holders shall not be used in advertising or otherwise to promote the
# sale, use or other dealings in this Software without prior written
# authorization.
# -----------------------------------------------------------------------------
# A directory of results from the bad-utf8 script will have a set of logfiles
# with the terminal-name after ":", e.g.,
#       UTF-8-test:foobar.txt
#
# and (for completeness) a logfile generated using the "-m" option:
#       UTF-8-test-m:foobar.txt
#
# If both are present, those should be identical.  Both are collected since
# many terminal emulators have poor error-recovery, and must be reset before
# collecting logs showing how they respond to ill-formed UTF-8.
#
# This script ensures that they are identical, and then computes the number of
# lines different between each combination of terminal emulator.  The resulting
# matrix can show which terminals have the same (but different) behavior from
# other groups of terminals.

use strict;
use warnings;
use Getopt::Std;
use Text::CSV;

our ( $opt_o, $opt_p, $opt_s, $opt_t );

our $destdir    = "/users/ftp/httpdocs/xterm/bad-utf8";
our $rootname   = "UTF-8-test";
our $sourcefile = "$rootname.txt";

our %terminals;    # hash from terminal name to filename

sub failed($) {
    my $msg = shift;
    STDOUT->flush;
    printf STDERR "? %s\n", $msg;
    exit 1;
}

sub abspath($) {
    my $path = shift;
    $path = "$destdir/$path" unless ( $path =~ /^(\.){0,2}\// );
    return $path;
}

sub filesize($) {
    my $filename = shift;
    my (
        $dev,  $ino,   $mode,  $nlink, $uid,     $gid, $rdev,
        $size, $atime, $mtime, $ctime, $blksize, $blocks
    ) = stat($filename);
    return $size;
}

sub readfile($) {
    my $filename = shift;
    my $encoding = ":raw :bytes";
    open( my $fh, "< $encoding", $filename ) || do {
        print STDERR "Can't open $filename: $!\n";
        exit;
    };
    binmode($fh);
    read $fh, my $bytes, &filesize($filename);
    close($fh);
    my @result = split /\n/, $bytes;
    return \@result;
}

sub section($$) {
    my $line   = shift;
    my $part   = shift;
    my $result = $part;
    if ( $line =~ /^\d(\.\d+)*\s/ ) {
        $result = $line;
        $result =~ s/\s.*//;
    }
    return $result;
}

# return hash of the line-numbers which differ between the two files
sub difference($$) {
    my $ref = shift;
    my $cmp = shift;
    my %result;
    my @ref = @{ &readfile($ref) };
    my @cmp = @{ &readfile($cmp) };
    my $max = $#ref;
    $max = $#cmp if ( $#cmp < $max );
    my $section = "";

    for my $n ( 0 .. $max ) {
        $section = &section( $ref[$n], $section );
        if ( $cmp[$n] ne $ref[$n] ) {
            $result{$n} = $section;
        }
    }
    if ( $#ref > $#cmp ) {
        for my $n ( $#cmp .. $#ref ) {
            $section = &section( $cmp[$n], $section );
            $result{$n} = $section;
        }
    }
    elsif ( $#cmp > $#ref ) {
        for my $n ( $#ref .. $#cmp ) {
            $section = &section( $ref[$n], $section );
            $result{$n} = $section;
        }
    }
    return \%result;
}

# return the number of lines which are different between the two files
sub compare($$) {
    my $ref  = shift;
    my $cmp  = shift;
    my %diff = %{ &difference( $ref, $cmp ) };
    my @diff = keys %diff;
    return $#diff + 1;
}

# Show differences in plain-text
sub report_all_plain($) {
    my $widest = shift;
    my @keys   = ( sort keys %terminals );
    for my $row (@keys) {
        printf "%-*s'", 1 + $widest, $row;
        for my $col (@keys) {
            printf "%3d", &compare( $terminals{$row}, $terminals{$col} );
        }
        printf "\n";
    }
    for my $col ( 0 .. 4 + $widest + $#keys * 3 ) {
        printf "-";
    }
    printf "\n";
    for my $row ( 0 .. $widest - 1 ) {
        printf "%-*s'", 1 + $widest, " ";
        for my $col ( 0 .. $#keys ) {
            my $key = $keys[$col];
            my $chr = " ";
            $chr = substr( $key, $row, 1 ) if ( $row < length($key) );
            printf "  %s", $chr;
        }
        printf "\n";
    }
}

# Show pairwise differences in plain-text
sub report_pair_plain() {
    printf "** pairwise report\n";
    my %groups;
    my @terms = sort keys %terminals;
    for my $level ( 0 .. $opt_p ) {
        printf ".. level %d\n", $level;
        for my $p ( 0 .. $#terms ) {
            for my $q ( 0 .. $p - 1 ) {
                my $r =
                  &compare( $terminals{ $terms[$p] },
                    $terminals{ $terms[$q] } );
                if ( $r == $level ) {
                    printf "%s vs %s", $terms[$p], $terms[$q];
                    if ( $level != 0 ) {
                        printf " (";
                        my %r = %{
                            &difference(
                                $terminals{ $terms[$p] },
                                $terminals{ $terms[$q] }
                            )
                        };
                        my $mark = "";
                        for $r ( sort { $a <=> $b } keys %r ) {
                            printf "%s%s", $mark, $r{$r};
                            $mark = ", ";
                        }
                        printf ")";
                    }
                    printf "\n";
                }
            }
        }
    }
}

# Show differences in CSV
sub report_all_csv($) {
    my $fp     = $_[0];
    my $widest = 12;
    my @keys   = ( sort keys %terminals );
    printf $fp "(compare)";
    for my $col (@keys) {
        printf $fp ",%s", $col;
    }
    printf $fp "\n";
    for my $row (@keys) {
        printf $fp "%s", $row;
        for my $col (@keys) {
            printf $fp ",%d", &compare( $terminals{$row}, $terminals{$col} );
        }
        printf $fp "\n";
    }
}

sub find_terminals() {
    use bytes;
    my $widest = 0;
    if ( opendir( my $dh, $destdir ) ) {
        my @entries = sort readdir($dh);
        closedir $dh;
        for my $n ( 0 .. $#entries ) {
            next if ( $entries[$n] =~ /^\.\.?$/ );
            next unless ( $entries[$n] =~ /:.*\.txt$/ );
            my $path = sprintf( "%s/%s", $destdir, $entries[$n] );
            next if ( -l $path );
            next unless ( -f $path );

            # printf "%s\n", $path;
            my $testroot = $entries[$n];
            $testroot =~ s,^.*/,,;
            $testroot =~ s,(-[[:alpha:]])?:.*,,;
            next unless ( $testroot eq $rootname );

            my $terminal = $entries[$n];
            $terminal =~ s/^.*://;
            $terminal =~ s/\.txt$//;

            if ( defined $terminals{$terminal} ) {
                &failed("mismatch $terminal")
                  if ( &compare( $path, $terminals{$terminal} ) != 0 );
            }
            else {
                $terminals{$terminal} = $path;
                $widest = length($terminal) if ( length($terminal) > $widest );
            }
        }
    }

    if ($opt_o) {
        my $fh;
        open( $fh, ">", $opt_o ) or &failed("open $opt_o");
        printf "** $opt_o\n";
        &report_all_csv($fh);
        close $fh;
    }
    else {
        &report_all_plain($widest) unless ($opt_p);
        &report_pair_plain($widest) if ($opt_p);
    }
    no bytes;
}

sub main::HELP_MESSAGE() {
    printf STDERR <<EOF
Usage: $0 [options] target

Options:

-o output  write table to output in CSV format
-p limit   report groups that fit into pairwise match
-s source  use a different source than
           $sourcefile
-t dir     use different directory for source/analysis files than
           $destdir
EOF
      ;
    exit;
}

$Getopt::Std::STANDARD_HELP_VERSION = 1;
&getopts('o:p:s:t:') || &main::HELP_MESSAGE;
if ($opt_p) {
    &main::HELP_MESSAGE unless ( $opt_p =~ /^\d+$/ );
}

$destdir    = $opt_t if ($opt_t);
$sourcefile = $opt_s if ($opt_s);
$sourcefile = &abspath($sourcefile);

$rootname = $sourcefile;
$rootname =~ s,\.[^./]*$,,;
$rootname =~ s,^.*/,,;

&find_terminals;

1;