From d9013cab5fb51d341f96e325b15e207859e4c313 Mon Sep 17 00:00:00 2001 From: Daniel Wagner-Hall Date: Mon, 14 Sep 2015 11:03:54 +0100 Subject: [PATCH] speculator: Add HTML diffing I started fiddling with re-implementing the perl script in Go to add some new functionality (and avoid the Perl), but it's not yet usable --- scripts/speculator/README | 1 + scripts/speculator/htmldiff.pl | 564 +++++++++++++++++++++++++++++++++ scripts/speculator/main.go | 130 ++++++-- 3 files changed, 661 insertions(+), 34 deletions(-) create mode 100755 scripts/speculator/htmldiff.pl diff --git a/scripts/speculator/README b/scripts/speculator/README index 6ab84f68..0a9f53fd 100644 --- a/scripts/speculator/README +++ b/scripts/speculator/README @@ -4,6 +4,7 @@ It serves the following HTTP endpoints: - / lists open pull requests - /spec/123 which renders the spec as html at pull request 123. - /diff/rst/123 which gives a diff of the spec's rst at pull request 123. + - /diff/html/123 which gives a diff of the spec's HTML at pull request 123. To run it, you must install the `go` tool, and run: `go run main.go` diff --git a/scripts/speculator/htmldiff.pl b/scripts/speculator/htmldiff.pl new file mode 100755 index 00000000..b1689067 --- /dev/null +++ b/scripts/speculator/htmldiff.pl @@ -0,0 +1,564 @@ +#!/usr/bin/perl +# +# htmldiff - present a diff marked version of two html documents +# +# Copyright (c) 1998-2006 MACS, Inc. +# +# Copyright (c) 2007 SiSco, Inc. 
+# +# Permission is hereby granted, free of charge, to any person obtaining +# a copy of this software and associated documentation files (the +# "Software"), to deal in the Software without restriction, including +# without limitation the rights to use, copy, modify, merge, publish, +# distribute, sublicense, and/or sell copies of the Software, and to +# permit persons to whom the Software is furnished to do so, subject to +# the following conditions: +# +# The above copyright notice and this permission notice shall be +# included in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION +# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +# +# See http://www.themacs.com for more information. +# +# usage: htmldiff [[-c] [-l] [-o] oldversion newversion [output]] +# +# -c - disable metahtml comment processing +# -o - disable outputting of old text +# -l - use navindex to create sequence of diffs +# oldversion - the previous version of the document +# newversion - the newer version of the document +# output - a filename to place the output in. If omitted, the output goes to +# standard output. +# +# if invoked with no options or arguments, operates as a CGI script. It then +# takes the following parameters: +# +# oldfile - the URL of the original file +# newfile - the URL of the new file +# mhtml - a flag to indicate whether it should be aware of MetaHTML comments. +# +# requires GNU diff utility +# also requires the perl modules Getopt::Std +# +# NOTE: The markup created by htmldiff may not validate against the HTML 4.0 +# DTD. 
This is because the algorithm is relatively simple, and there are +# places in the markup content model where the span element is not allowed. +# Htmldiff is NOT aware of these places. +# +# $Source: /u/sources/public/2009/htmldiff/htmldiff.pl,v $ +# $Revision: 1.1 $ +# +# $Log: htmldiff.pl,v $ +# Revision 1.1 2014/01/06 08:04:51 dom +# added copy of htmldiff perl script since aptest.com repo no longer available +# +# Revision 1.5 2008/03/05 13:23:16 ahby +# Fixed a problem with leading whitespace before markup. +# +# Revision 1.4 2007/12/13 13:09:16 ahby +# Updated copyright and license. +# +# Revision 1.3 2007/12/13 12:53:34 ahby +# Changed use of span to ins and del +# +# Revision 1.2 2002/02/13 16:27:23 ahby +# Changed processing model. +# Improved handling of old text and changed styles. +# +# Revision 1.1 2000/07/12 12:20:04 ahby +# Updated to remove empty spans - this fixes validation problems under +# strict. +# +# Revision 1.11 1999/12/08 19:46:45 ahby +# Fixed validation errors introduced by placing markup where it didn't +# belong. +# +# Revision 1.10 1999/10/18 13:42:58 ahby +# Added -o to the usage message. +# +# Revision 1.9 1999/05/04 12:29:11 ahby +# Added an option to turn off the display of old text. +# +# Revision 1.8 1999/04/09 14:37:27 ahby +# Fixed a perl syntax error. +# +# Revision 1.7 1999/04/09 14:35:49 ahby +# Added reference to MACS homepage. +# +# Revision 1.6 1999/04/09 14:35:09 ahby +# Added comment about validity of generated markup. +# +# Revision 1.5 1999/02/22 22:17:54 ahby +# Changed to use stylesheets. +# Changed to rely upon span. +# Changed to work around content model problems. +# +# Revision 1.4 1999/02/08 02:32:22 ahby +# Added a copyright statement. +# +# Revision 1.3 1999/02/08 02:30:40 ahby +# Added header processing. +# +# Revision 1.2 1998/12/10 17:31:31 ahby +# Fixed to escape less-thans in change blocks and to not permit change +# markup within specific elements (like TITLE). 
+# +# Revision 1.1 1998/11/26 00:09:22 ahby +# Initial revision +# +# + +use Getopt::Std; + +sub usage { + print STDERR "htmldiff [-c] [-o] oldversion newversion [output]\n"; + exit; +} + +sub url_encode { + my $str = shift; + $str =~ s/([\x00-\x1f\x7F-\xFF])/ + sprintf ('%%%02x', ord ($1))/eg; + return $str; +} + +# markit - diff-mark the streams +# +# markit(file1, file2) +# +# markit relies upon GNUdiff to mark up the text. +# +# The markup is encoded using special control sequences: +# +# a block wrapped in control-a is deleted text +# a block wrapped in control-b is old text +# a block wrapped in control-c is new text +# +# The main processing loop attempts to wrap the text blocks in appropriate +# SPANs based upon the type of text that it is. +# +# When the loop encounters a < in the text, it stops the span. Then it outputs +# the element that is defined, then it restarts the span. + +sub markit { + my $retval = ""; + my($file1) = shift; + my($file2) = shift; +# my $old="deleted text: %c'\012'%c'\001'%c'\012'%<%c'\012'%c'\001'%c'\012'"; + my $old="%c'\012'%c'\001'%c'\012'%<%c'\012'%c'\001'%c'\012'"; + my $new="%c'\012'%c'\003'%c'\012'%>%c'\012'%c'\003'%c'\012'"; + my $unchanged="%="; + my $changed="%c'\012'%c'\001'%c'\012'%<%c'\012'%c'\001'%c'\012'%c'\004'%c'\012'%>%c'\012'%c'\004'%c'\012'"; + if ($opt_o) { + $old = ""; + $changed = "%c'\012'%c'\004'%c'\012'%>%c'\012'%c'\004'%c'\012'"; + } +# my $old="%c'\002'deleted text:%c'\012'%c'\001'%c'\012'%<%c'\012'%c'\001'%c'\012'%c'\012'%c'\002'"; +# my $new="%c'\002'%c'\012'%c'\002'%>%c'\002'%c'\002'%c'\012'"; +# my $unchanged="%="; +# my $changed="%c'\002'%c'\012'%c'\001'%c'\012'%<%c'\012'%c'\001'%c'\012'%c'\002'%c'\012'%>%c'\012'%c'\002'%c'\002'%c'\012'"; + + my @span; + $span[0]=""; + $span[1]=""; + $span[2]=""; + $span[3]=""; + $span[4]=""; + + my @diffEnd ; + $diffEnd[1] = ''; + $diffEnd[2] = ''; + $diffEnd[3] = ''; + $diffEnd[4] = ''; + + my $diffcounter = 0; + + open(FILE, qq(diff -d --old-group-format="$old" 
--new-group-format="$new" --changed-group-format="$changed" --unchanged-group-format="$unchanged" $file1 $file2 |)) || die("Diff failed: $!"); +# system (qq(diff --old-group-format="$old" --new-group-format="$new" --changed-group-format="$changed" --unchanged-group-format="$unchanged" $file1 $file2 > /tmp/output)); + + my $state = 0; + my $inblock = 0; + my $temp = ""; + my $lineCount = 0; + +# strategy: +# +# process the output of diff... +# +# a link with control A-D means the start/end of the corresponding ordinal +# state (1-4). Resting state is state 0. +# +# While in a state, accumulate the contents for that state. When exiting the +# state, determine if it is appropriate to emit the contents with markup or +# not (basically, if the accumulated buffer contains only empty lines or lines +# with markup, then we don't want to emit the wrappers. We don't need them. +# +# Note that if there is markup in the "old" block, that markup is silently +# removed. It isn't really that interesting, and it messes up the output +# something fierce. + + while () { + my $anchor = $opt_l ? qq[] : "" ; + my $anchorEnd = $opt_l ? q[] : "" ; + $lineCount ++; + if ($state == 0) { # if we are resting and we find a marker, + # then we must be entering a block + if (m/^([\001-\004])/) { + $state = ord($1); + $_ = ""; + } +# if (m/^\001/) { +# $state = 1; +# s/^/$span[1]/; +# } elsif (m/^\002/) { +# $state = 2; +# s/^/$span[2]/; +# } elsif (m/^\003/) { +# $state = 3; +# s/^/$span[3]/; +# } elsif (m/^\004/) { +# $state = 4; +# s/^/$span[4]/; +# } + } else { + # if we are in "old" state, remove markup + if (($state == 1) || ($state == 2)) { + s/\<.*\>//; # get rid of any old markup + s/\/>/g; + } + # if we found another marker, we must be exiting the state + if (m/^([\001-\004])/) { + if ($temp ne "") { + $_ = $span[$state] . $anchor . $temp . $anchorEnd . $diffEnd[$state] . 
"\n"; + $temp = ""; + } else { + $_ = "" ; + } + $state = 0; + } elsif (m/^\s*\ +.diff-old-a { + font-size: smaller; + color: red; +} + +.diff-new { background-color: yellow; } +.diff-chg { background-color: lime; } +.diff-new:before, +.diff-new:after + { content: "\2191" } +.diff-chg:before, .diff-chg:after + { content: "\2195" } +.diff-old { text-decoration: line-through; background-color: #FBB; } +.diff-old:before, +.diff-old:after + { content: "\2193" } +:focus { border: thin red solid} + +); + if ($opt_t) { + $styles .= q( + +); + + } + + if ($stripheader) { + open(HEADER, ">$headertmp"); + } + + my $incomment = 0; + my $inhead = 1; + open(FILE, $filename) || die("File $filename cannot be opened: $!"); + while () { + if ($inhead == 1) { + if (m/\<\/head/i) { + print HEADER $styles; + } + if (m/\ +); + } + close HEADER; + } else { + print HEADER; + } + } else { + if ($incomment) { + if (m;-->;) { + $incomment = 0; + s/.*-->//; + } else { + next; + } + } + if (m;;) { + s///; + } + if (m;