###
# NAME:         esis2tree
#               Copyright (c) 1996 Liam Relihan. 
#
# PURPOSE:      
#
# AUTHOR:       Liam Relihan
#               <liam@homer.njit.edu>
# SCCS:         %M%        %R%.%L% %D%
# DATE:         5-Mar-1996
# PARAMETERS:   
# CALLED BY:
# MODIFIED:     15-mar-1996     LOR    Changed .data output from packed 
#                                      binary to ASCII
#               18-Mar-1996     LOR    Cleaned up DATA file label formatting
#               19-Mar-1996     LOR    Fixed serious bug in signature 
#                                      calculation. Was using ^ instead of **
#               20-Mar-1996     LOR    Added -r option
#               15-Apr-1996     LOR    Added propagated signatures
#               18-Apr-1996     LOR    Added more comments
#
# NOTES:        
#               Uses multi-dimensional arrays so this runs on perl5 only.
#               Use nsgmls with the -oline option when piping to this script.
#
#               When producing labels for elements with attributes,
#               note that this script does not necessarily preserve the 
#               order in which they were declared in the document source 
#               file. In addition, attributes with default values are 
#               explicitely declared in the label.
#
# BUGS:         Does not appear to work with some compiles of perl5 for
#               Solaris (version 5.001). This appears to be a fault of perl
#               rather than of this script. Solution is to get new compiled
#               version of perl5.
#
#               This script is very complex. While it appears to work OK,
#               it might need to be restructured at some later date. 
##
require "getopts.pl";

&Getopts('hr');

if ($opt_h) {
    print <<"EOF";
usage: $0 [-h] [-r] [<stem>] < ESIS_STREAM
     <stem>        the stem for the two data files that will be generated
     -r            relax; the document does not need to be conforming for 
                   data files to be produced. Use with caution
     -h            this help
     ESIS_STREAM   ESIS stream as produced by nsgmls or similar (ensure
                   that the stream contains line number information)

EOF
    exit;
}

$output_file_stem=$ARGV[0]||'tree';

$preorder_count=0;

# setup some prime numbers

$word_prime = 3989;
$line_prime = 57593;
$para_prime = 99983;
$propagate_prime = 13;

$label_length=18;

$lineno=0; #initialize lineno just in case nsgmls was not called with -oline

# open the output files
open(TREE,">$output_file_stem.tree");
open(DATA,">$output_file_stem.data");


### In general, the following code is not user servicable:
############## Begin Factory servicable Code only ##############
$backslash_in_data = "\\e";

$prog = $0;

$prog =~ s|.*/||;

$level = 0;

while (<STDIN>) {
    chop;
    $command = substr($_, 0, 1);
    substr($_, 0, 1) = "";
    if ($command eq '(') {
	&start_element($_);
	$level++;
    }
    elsif ($command eq ')') {
	$level--;
	&end_element($_);
	foreach $key (keys %attribute_value) {
	    @splitkey = split($;, $key);
	    if ($splitkey[0] == $level) {
		delete $attribute_value{$key};
		delete $attribute_type{$key};
	    }
	}
    }
    elsif ($command eq '-') {
# commented out by Liam Relihan to allow embedded newlines,etc
	&unescape_data($_);
	&data($_);
    }
    elsif ($command eq 'A') {
	@field = split(/ /, $_, 3);
	$attribute_type{$level,$field[0]} = $field[1];
	&unescape_data($field[2]);
	$attribute_value{$level,$field[0]} = $field[2];
    }
    elsif ($command eq '&') {
	&entity($_);
    }
    elsif ($command eq 'D') {
	@field = split(/ /, $_, 4);
	$data_attribute_type{$field[0], $field[1]} = $field[2];
	&unescape_data($field[3]);
	$data_attribute_value{$field[0], $field[1]} = $field[3];
    }
    elsif ($command eq 'N') {
	$notation{$_} = 1;
	if (defined($sysid)) {
	    $notation_sysid{$_} = $sysid;
	    undef($sysid);
	}
	if (defined($pubid)) {
	    $notation_pubid{$_} = $pubid;
	    undef($pubid);
	}
    }
    elsif ($command eq 'I') {
        @field = split(/ /, $_, 3);
	$entity_type{$field[0]} = $field[1];
	&unescape($field[2]);
	# You may want to substitute \e for \ if the type is CDATA.
	$entity_text{$field[0]} = $field[2];
	$entity_code{$field[0]} = 'I';
    }
    elsif ($command eq 'E') {
	@field = split(/ /, $_);
	$entity_code{$field[0]} = 'E';
	$entity_type{$field[0]} = $field[1];
	$entity_notation{$field[0]} = $field[2];
	if (defined(@files)) {
	    foreach $i (0..$#files) {
		$entity_filename{$field[0], $i} = $files[$i];
	    }
	    undef(@files);
	}
	if (defined($sysid)) {
	    $entity_sysid{$field[0]} = $sysid;
	    undef($sysid);
	}
	if (defined($pubid)) {
	    $entity_pubid{$field[0]} = $pubid;
	    undef($pubid);
	}
    }
    elsif ($command eq 'S') {
	$entity_code{$_} = 'S';
	if (defined(@files)) {
	    foreach $i (0..$#files) {
		$entity_filename{$_, $i} = $files[$i];
	    }
	    undef(@files);
	}
	if (defined($sysid)) {
	    $entity_sysid{$_} = $sysid;
	    undef($sysid);
	}
	if (defined($pubid)) {
	    $entity_pubid{$_} = $pubid;
	    undef($pubid);
	}
    }
    elsif ($command eq '?') {
	&unescape($_);
	&pi($_);
    }
    elsif ($command eq 'L') {
	@field = split(/ /, $_);
	$lineno = $field[0];
	if ($#field >= 1) {
	    &unescape($field[1]);
	    $filename = $field[1];
	}
    }
    elsif ($command eq 'V') {
	@field = split(/ /, $_, 2);
	&unescape($field[1]);
	$environment{$field[0]} = $field[1];
    }
    elsif ($command eq '{') {
	&start_subdoc($_);
    }
    elsif ($command eq '}') {
	&end_subdoc($_);
    }
    elsif ($command eq 'f') {
	&unescape($_);
	push(@files, $_);
    }
    elsif ($command eq 'p') {
	&unescape($_);
	$pubid = $_;
    }
    elsif ($command eq 's') {
	&unescape($_);
	$sysid = $_;
    }
    elsif ($command eq 'C') {
	$conforming = 1;
	&end_doc;
    }
    else {
	warn "$prog:$ARGV:$.: ($lineno) unrecognized command \`$command'\n";
    }
}

if ($opt_r) {
    &record_data(@element_details_list);
    close(TREE);
    close(DATA);
    print "Relaxed Mode (-r option set):\n";
    print "This document was non-conforming.\n";
    print "Output is in \"$output_file_stem.tree\" and ";
    print "\"$output_file_stem.data\".\n";
    exit 0;
} 

close(TREE);
close(DATA);
unlink "$output_file_stem.tree","$output_file_stem.data";
print "It appears that there have been problems parsing your document.\n";
print "The datafiles have been deleted.\n";
exit -1;

#################### beginning of subroutines ####################


sub unescape {
    $_[0] =~ s/\\([0-7][0-7]?[0-7]?|.)/&esc($1)/eg;
}

sub esc {
    local($_) = $_[0];
    if ($_ eq '012' || $_ eq '12') {
	"";			# ignore RS
    }
    elsif (/^[0-7]/) {
	sprintf("%c", oct);
    }
    elsif ($_ eq 'n') {
	"\n";
    }
    elsif ($_ eq '|') {
	"";
    }
    elsif ($_ eq "\\") {
	"\\";
    }
    else {
	$_;
    }
}

sub unescape_data {
    local($sdata) = 0;
    $_[0] =~ s/\\([0-7][0-7]?[0-7]?|.)/&esc_data($1)/eg;
}

sub esc_data {
    local($_) = $_[0];
    if ($_ eq '012' || $_ eq '12') {
	"";			# ignore RS
    }
    elsif (/^[0-7]/) {
	sprintf("%c", oct);
    }
    elsif ($_ eq 'n') {
	"\n";
    }
    elsif ($_ eq '|') {
	$sdata = !$sdata;
	"";
    }
    elsif ($_ eq "\\") {
	$sdata ? "\\" : $backslash_in_data;
    }
    else {
	$_;
    }
}

############## End Factory servicable Code only ##############

# the following is invoked upon the start of an element
sub start_element {
    local($gi) = $_[0];
    local(@rec,$sig,$attrib_sig_string,$sig_data,$wordcnt,$attrib_encoding);
    
    print TREE "($gi";

    # setup the attributes and get their signature
    foreach $attrib (keys %attribute_value) {
 	if (($attrib_name)=($attrib=~/^$level(.*)$/)) {
	    if ($attribute_type{$attrib} eq "CDATA") {
		$attrib_encoding.=
		    " $attrib_name=\"$attribute_value{$attrib}\"";
	    }elsif ($attribute_type{$attrib} eq "TOKEN") {
		$attrib_encoding.=" $attribute_value{$attrib}";
	    }
	    $attrib_sig_string.="$attribute_value{$attrib}\n";
 	}			 
    }
    $sig=&signature($attrib_sig_string); 

    #      preorder,line_no,end_line_no,signature,p_signature,num_children,
    #      nword,str,label

    @rec=($preorder_count++,
	  $lineno,
	  0,
	  $sig,
	  $sig,
	  0,
	  0,
	  $gi,
	  substr("<$gi$attrib_encoding>",0,$label_length));

    push(@element_stack,[@rec]);
}

# this is invoked at the end of an element
sub end_element {
    local($gi) = $_[0];
    local($order_count,$start_lineno,$end_lineno,$sig,
	  $nword,$unit_name,$label);
    local(@stk_vals);
    print TREE ")";
    #      preorder,line_no,end_line_no,signature,nword,str,label
    $stk_vals=pop(@element_stack);
    $stk_vals->[2]=$lineno;
    push(@element_details_list,[@$stk_vals]);
    if ($#element_stack>=0) {
	&add_to_sig($stk_vals->[4],$#element_stack);
    }
}


# this is invoked when PCDATA is detected
sub data {
    local($data,@datadetails) = $_[0];
    local($label,$numlines);

    $numlines= $data=~ tr/\n/\n/; #find the number of lines
    
    print TREE "(DATA)";
    #      preorder,line_no,end_line_no,signature,nword,str,label

    $label=$data;
    $label=~s/\n/ /g; # replace newlines with spaces
    $label=~s/^[ \t\n]*//; # get rid of leading whitespace 
    $label=substr($label,0,$label_length);

    $sig=&signature($data);

    &add_to_sig($sig,$#element_stack);

    @datadetails=($preorder_count++,
		  $lineno,
		  $lineno+$numlines,
		  $sig,
		  $sig,
		  0,
		  &wordcount($data),
		  "data",
		  $label);

    push(@element_details_list,[@datadetails]);
}


# add a signature to another signature
# multiply by the number of children
sub add_to_sig {
    local($sig,$lev) = @_;

    $element_stack[$lev]->[5]++; #inc children num
    $element_stack[$lev]->[4]=$element_stack[$lev]->[4]+
	($sig*$element_stack[$lev]->[5]);
}

# invoke this at the end of a conforming document
sub end_doc {
    local($name) = $_[0];

    &record_data(@element_details_list);

    print "Document is conforming.\n";
    print "Output is in \"$output_file_stem.tree\" and ";
    print "\"$output_file_stem.data\".\n";

    close(TREE);
    close(DATA);

    exit 0;
}

# write the data to the data file
sub record_data{
    local(@datalist)=@_;
    local($order_count, $start_lineno, $end_lineno, $sig, $p_sig,
	  $num_children, $nword, $unit_name, $label);
    
    foreach $rec (sort {$a->[0] <=> $b->[0]}  @datalist) {

	($order_count,
	 $start_lineno,
	 $end_lineno,
	 $sig,
	 $p_sig,
	 $num_children,
	 $nword,
	 $unit_name,
	 $label)=@$rec;

	print DATA $start_lineno,"\n",	
	$end_lineno,"\n",
	$sig,"\n",
	$p_sig,"\n",
	$nword,"\n",
	$unit_name,"\n",
	$label,"\n",
	"***\n";		
    }
}


# calculate signature for a piece of text
sub signature {
    local($textstr) = @_;
    local($line_counter,@lines,$letter,$num_letters);
    local($para_hash,$line_hash,$word_hash,$word);

    $para_hash=0;
    @lines=split(/\n/,$textstr);
    for $line_counter (0..$#lines) {
	@words=split(/[ \n,;\.:\"]+/,$lines[$line_counter]);
	$line_hash=0;
	for $word_counter (0..$#words){
	    $word_hash=0;
	    $word=$words[$word_counter];
	    $num_letters=length($word);
	    foreach $letter (1..$num_letters) {
		$word_hash += (ord(substr($word,$letter-1,1)) * 
			       2**($num_letters-$letter));

	    }
	    $word_hash = $word_hash % $word_prime;
	    $line_hash += $word_hash * 2**($#words-$word_counter);
	}		
	$line_hash = $line_hash % $line_prime;
	$para_hash+= $line_hash * 2**($#lines-$line_counter);
    }
    $para_hash=$para_hash % $para_prime;

    return $para_hash;
}

# count the number of words in a string
sub wordcount {
    local($text)=@_;
    local(@words);

    @words=split(/[ \n]+/,$text);
    return $#words+1;
}

############## Redundant Code ##############
####### leave intact for the moment ########

# A processing instruction.

sub pi {
    local($data) = $_[0];
    # XXX
}

# A reference to an external entity.
sub entity {
    local($name) = $_[0];
    # XXX
}

sub start_subdoc {
    local($name) = $_[0];
    # XXX
}

sub end_subdoc {
    local($name) = $_[0];
    # XXX
}


