#!/usr/bin/perl
#
# htmldiff - present a diff marked version of two html documents
#
# Copyright (c) 1998-2006 MACS, Inc.
#
# Copyright (c) 2007 SiSco, Inc.
#
# Permission is hereby granted, free of charge, to any person obtaining
# a copy of this software and associated documentation files (the
# "Software"), to deal in the Software without restriction, including
# without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of the Software, and to
# permit persons to whom the Software is furnished to do so, subject to
# the following conditions:
#
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
#
# See http://www.themacs.com for more information.
#
# usage: htmldiff [[-c] [-l] [-o] oldversion newversion [output]]
#
# -c - disable metahtml comment processing
# -o - disable outputting of old text
# -l - use navindex to create sequence of diffs
# oldversion - the previous version of the document
# newversion - the newer version of the document
# output - a filename to place the output in. If omitted, the output goes to
# standard output.
#
# if invoked with no options or arguments, operates as a CGI script. It then
# takes the following parameters:
#
# oldfile - the URL of the original file
# newfile - the URL of the new file
# mhtml - a flag to indicate whether it should be aware of MetaHTML comments.
#
# requires GNU diff utility
# also requires the perl module Getopt::Std
#
# NOTE: The markup created by htmldiff may not validate against the HTML 4.0
# DTD. This is because the algorithm is relatively simple, and there are
# places in the markup content model where the span element is not allowed.
# Htmldiff is NOT aware of these places.
#
# $Source: /u/sources/public/2009/htmldiff/htmldiff.pl,v $
# $Revision: 1.1 $
#
# $Log: htmldiff.pl,v $
# Revision 1.1 2014/01/06 08:04:51 dom
# added copy of htmldiff perl script since aptest.com repo no longer available
#
# Revision 1.5 2008/03/05 13:23:16 ahby
# Fixed a problem with leading whitespace before markup.
#
# Revision 1.4 2007/12/13 13:09:16 ahby
# Updated copyright and license.
#
# Revision 1.3 2007/12/13 12:53:34 ahby
# Changed use of span to ins and del
#
# Revision 1.2 2002/02/13 16:27:23 ahby
# Changed processing model.
# Improved handling of old text and changed styles.
#
# Revision 1.1 2000/07/12 12:20:04 ahby
# Updated to remove empty spans - this fixes validation problems under
# strict.
#
# Revision 1.11 1999/12/08 19:46:45 ahby
# Fixed validation errors introduced by placing markup where it didn't
# belong.
#
# Revision 1.10 1999/10/18 13:42:58 ahby
# Added -o to the usage message.
#
# Revision 1.9 1999/05/04 12:29:11 ahby
# Added an option to turn off the display of old text.
#
# Revision 1.8 1999/04/09 14:37:27 ahby
# Fixed a perl syntax error.
#
# Revision 1.7 1999/04/09 14:35:49 ahby
# Added reference to MACS homepage.
#
# Revision 1.6 1999/04/09 14:35:09 ahby
# Added comment about validity of generated markup.
#
# Revision 1.5 1999/02/22 22:17:54 ahby
# Changed to use stylesheets.
# Changed to rely upon span.
# Changed to work around content model problems.
#
# Revision 1.4 1999/02/08 02:32:22 ahby
# Added a copyright statement.
#
# Revision 1.3 1999/02/08 02:30:40 ahby
# Added header processing.
#
# Revision 1.2 1998/12/10 17:31:31 ahby
# Fixed to escape less-thans in change blocks and to not permit change
# markup within specific elements (like TITLE).
#
# Revision 1.1 1998/11/26 00:09:22 ahby
# Initial revision
#
#
use Getopt::Std;
# usage - print a one-line command synopsis to STDERR and terminate
#
# usage()
#
# Takes no arguments and never returns. The synopsis lists -l so that it
# agrees with the option summary in the header comment of this file.
# The bare "exit" is kept as-is, so the process still ends with status 0.
sub usage {
    print STDERR "htmldiff [-c] [-l] [-o] oldversion newversion [output]\n";
    exit;
}
# url_encode - percent-escape control and high-bit characters
#
# url_encode(string)
#
# Each character in the ranges 0x00-0x1F and 0x7F-0xFF is replaced by "%"
# followed by its code as two lowercase hex digits. Everything else
# (including spaces, "%" and other URL punctuation) is passed through
# unchanged. Returns the converted copy; the caller's value is not modified.
sub url_encode {
    my ($text) = @_;
    my $to_hex = sub { return sprintf('%%%02x', ord($_[0])) };
    $text =~ s{([\x00-\x1f\x7F-\xFF])}{ $to_hex->($1) }eg;
    return $text;
}
# markit - diff-mark the streams
#
# markit(file1, file2)
#
# markit relies upon GNUdiff to mark up the text.
#
# The markup is encoded using special control sequences:
#
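# a block wrapped in control-a is deleted (old) text
# a block wrapped in control-c is newly added text
# a block wrapped in control-d is the new side of changed text
# (control-b is still recognised as a block marker, but the active diff
# formats never emit it)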
#
# The main processing loop attempts to wrap the text blocks in appropriate
# SPANs based upon the type of text that it is.
#
# When the loop encounters a < in the text, it stops the span. Then it outputs
# the element that is defined, then it restarts the span.
#
# Arguments:
#   file1 - name of the old document; becomes diff's first operand
#   file2 - name of the new document; becomes diff's second operand
#
# Returns the marked-up text accumulated from diff's output as one string.
# Dies with "Diff failed: ..." if the pipe to diff cannot be opened.
#
# Also reads the globals $opt_o (drop the old side of every difference) and
# $opt_l (selects the per-block anchor strings).
#
# NOTE(review): several constructs in this sub look damaged in this copy of
# the file, as if "<...>" sequences and "&...;" entities had been stripped:
# the main "while ()" reads no filehandle, every $span/$diffEnd/anchor
# string is empty, the substitution commented "escape any remaining STAG or
# ETAGs" has no replacement part, and the "is this line markup?" match is
# unterminated. With empty $span/$diffEnd strings the clean-up substitutions
# after the loop would also match bare runs of newlines. Confirm against the
# upstream htmldiff.pl before relying on or changing any of these lines.
sub markit {
my $retval = "";
my($file1) = shift;
my($file2) = shift;
# GNU diff group formats. In each one %< / %> / %= stand for the old, new
# and common lines of a group and %c'X' emits the single character X, so
# every marker (control-A, -C or -D) ends up alone on a line before and
# after the lines it brackets.
# my $old="deleted text: %c'\012'%c'\001'%c'\012'%<%c'\012'%c'\001'%c'\012'";
my $old="%c'\012'%c'\001'%c'\012'%<%c'\012'%c'\001'%c'\012'";
my $new="%c'\012'%c'\003'%c'\012'%>%c'\012'%c'\003'%c'\012'";
my $unchanged="%=";
my $changed="%c'\012'%c'\001'%c'\012'%<%c'\012'%c'\001'%c'\012'%c'\004'%c'\012'%>%c'\012'%c'\004'%c'\012'";
# -o: emit nothing for deleted groups and only the new side of changed ones.
if ($opt_o) {
$old = "";
$changed = "%c'\012'%c'\004'%c'\012'%>%c'\012'%c'\004'%c'\012'";
}
# Alternative formats, currently disabled.
# my $old="%c'\002'deleted text:%c'\012'%c'\001'%c'\012'%<%c'\012'%c'\001'%c'\012'%c'\012'%c'\002'";
# my $new="%c'\002'%c'\012'%c'\002'%>%c'\002'%c'\002'%c'\012'";
# my $unchanged="%=";
# my $changed="%c'\002'%c'\012'%c'\001'%c'\012'%<%c'\012'%c'\001'%c'\012'%c'\002'%c'\012'%>%c'\012'%c'\002'%c'\002'%c'\012'";
# Text placed in front of a block when it is emitted, indexed by state
# number. NOTE(review): all entries are empty strings here - presumably the
# opening markup was lost; confirm.
my @span;
$span[0]="";
$span[1]="";
$span[2]="";
$span[3]="";
$span[4]="";
# Text placed after a block when it is emitted, indexed by state number.
# NOTE(review): likewise all empty here - presumably the closing markup.
my @diffEnd ;
$diffEnd[1] = '';
$diffEnd[2] = '';
$diffEnd[3] = '';
$diffEnd[4] = '';
# Incremented once per pass at the bottom of the main loop.
my $diffcounter = 0;
# Run diff through the shell and read its output from FILE; -d asks diff to
# try harder to find a smaller set of changes.
# NOTE(review): $file1 and $file2 are interpolated unquoted into the command
# line, so names containing whitespace or shell metacharacters would be
# misparsed by the shell - confirm callers only pass safe names.
open(FILE, qq(diff -d --old-group-format="$old" --new-group-format="$new" --changed-group-format="$changed" --unchanged-group-format="$unchanged" $file1 $file2 |)) || die("Diff failed: $!");
# system (qq(diff --old-group-format="$old" --new-group-format="$new" --changed-group-format="$changed" --unchanged-group-format="$unchanged" $file1 $file2 > /tmp/output));
# $state: 0 while resting, 1-4 while inside the block opened by control-A..D.
# $temp: text collected so far for the block that is currently open.
# $inblock is not referenced again in this sub; $lineCount is only counted.
my $state = 0;
my $inblock = 0;
my $temp = "";
my $lineCount = 0;
# strategy:
#
# process the output of diff...
#
# a line with control A-D means the start/end of the corresponding ordinal
# state (1-4). Resting state is state 0.
#
# While in a state, accumulate the contents for that state. When exiting the
# state, determine if it is appropriate to emit the contents with markup or
# not (basically, if the accumulated buffer contains only empty lines or lines
# with markup, then we don't want to emit the wrappers. We don't need them.
#
# Note that if there is markup in the "old" block, that markup is silently
# removed. It isn't really that interesting, and it messes up the output
# something fierce.
while () {
# Rebuilt on every pass; these wrap the collected text when a block closes.
my $anchor = $opt_l ? qq[] : "" ;
my $anchorEnd = $opt_l ? q[] : "" ;
$lineCount ++;
if ($state == 0) { # if we are resting and we find a marker,
# then we must be entering a block
if (m/^([\001-\004])/) {
$state = ord($1);
$_ = "";
}
# if (m/^\001/) {
# $state = 1;
# s/^/$span[1]/;
# } elsif (m/^\002/) {
# $state = 2;
# s/^/$span[2]/;
# } elsif (m/^\003/) {
# $state = 3;
# s/^/$span[3]/;
# } elsif (m/^\004/) {
# $state = 4;
# s/^/$span[4]/;
# }
} else {
# if we are in "old" state, remove markup
if (($state == 1) || ($state == 2)) {
s/\<.*\>//; # get rid of any old markup
s/\</g; # escape any remaining STAG or ETAGs
s/\>/>/g;
}
# if we found another marker, we must be exiting the state
if (m/^([\001-\004])/) {
if ($temp ne "") {
$_ = $span[$state] . $anchor . $temp . $anchorEnd . $diffEnd[$state] . "\n";
$temp = "";
} else {
$_ = "" ;
}
$state = 0;
} elsif (m/^\s*\) { # otherwise, is this line markup?
# if it is markup AND we haven't seen anything else yet,
# then we will emit the markup
if ($temp eq "") {
$retval .= $_;
$_ = "";
} else { # we wrap it with the state switches and hold it
s/^/$anchorEnd$diffEnd[$state]/;
s/$/$span[$state]$anchor/;
$temp .= $_;
$_ = "";
}
} else {
if (m/.+/) {
$temp .= $_;
$_ = "";
}
}
}
s/\001//g;
s/\002//g;
s/\003//g;
s/\004//g;
if ($_ !~ m/^$/) {
$retval .= $_;
}
$diffcounter++;
}
close FILE;
$retval =~ s/$span[1]\n+$diffEnd[1]//g;
$retval =~ s/$span[2]\n+$diffEnd[2]//g;
$retval =~ s/$span[3]\n+$diffEnd[3]//g;
$retval =~ s/$span[4]\n+$diffEnd[4]//g;
$retval =~ s/$span[1]\n*$//g;
$retval =~ s/$span[2]\n*$//g;
$retval =~ s/$span[3]\n*$//g;
$retval =~ s/$span[4]\n*$//g;
return $retval;
}
sub splitit {
my $filename = shift;
my $headertmp = shift;
my $inheader=0;
my $preformatted=0;
my $inelement=0;
my $retval = "";
my $styles = q(
);
if ($opt_t) {
$styles .= q(
);
}
if ($stripheader) {
open(HEADER, ">$headertmp");
}
my $incomment = 0;
my $inhead = 1;
open(FILE, $filename) || die("File $filename cannot be opened: $!");
while () {
if ($inhead == 1) {
if (m/\<\/head/i) {
print HEADER $styles;
}
if (m/\
);
}
close HEADER;
} else {
print HEADER;
}
} else {
if ($incomment) {
if (m;-->;) {
$incomment = 0;
s/.*-->//;
} else {
next;
}
}
if (m;;) {
s///;
}
if (m;