#!/usr/bin/perl

# Convert an ej-formatted doc in to a simple docbook document
# Input: First argument or standard input
# Output: Standard output

$TMP = $ENV{'HOME'} . "/tmp";
# Make this just /tmp at your own risk.  You have been warned.

if(! -d $TMP ) {
    die "Fatal: Please create a directory entitled " . $TMP . "\n";
    }

# Read in the doc

while(<>){$doc .= $_}

$* = 1; # Match multiple lines

# Get rid of <!-- ... --> comments
$doc =~ s|<\!\-\-.*?\-\->||sg;

# body flags
$bodyflags = "";

# Grab the header
if($doc =~ m|<head>(.*?)</head>|is) {
    $header = $1;
    # Get rid of the TH; HTML doesn't use it
    $header =~ s|<th>.*?</th>||igs;
    # Get rid of DTWIDTH too
    $header =~ s|<dtwidth>.*?</dtwidth>||igs;
    # Parse the body flags
    if($header =~ s|<bodyflags>(.*?)</bodyflags>||igs) {
        $bodyflags = $1;
	}
    }
else {
    die "Fatal: Document must have a heading section\n";
    }

# Make sure the header has 
# <meta HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=XXX">
# Where XXX is any character set
if($header !~ 
m|meta\s+http\-equiv\=\"content\-type\"\s+content\=\"text\/html\;\s+charset=|i) 
  {
  print "Please have somthing like this:\n";
  print
    '<META HTTP-EQUIV="Content-Type" CONTENT="text/html; CHARSET=utf-8">';
  print "\n";
  die "Fatal: Header must declare charset\n";
  }

# Get rid of charset delaration in header; docbook can't handle those
# kinds of things
$header =~ s|<meta[^>]*>||igs;

# OK, the header looks kosher.  Start generating docbook

print '<!DOCTYPE article PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"' . "\n";
print '"http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd">' . "\n";
print '<!-- Do *not* edit this file; it was automatically generated by';
print " ej2dbk\n";
print '     Look for a name.ej file with the same name as this filename -->';
print "\n<!-- Disclaimer: This is poor docbook which does not correctly\n";
print "close tags -->\n";
# Timestamp
$ts = localtime(time());
print '<!-- Last updated ' . $ts . " -->\n";
print "\n";
print "<article>\n<articleinfo>\n";
print $header;
print "</articleinfo>\n";

# Enough of header processing; let's get to the body of the document

# Grab the body
if($doc =~ m|<body>(.*?)</body>|is) {
    $body = $1;
    }
else {
    die "Fatal: Document must have a body section\n";
    }

$body = process_body($body);
#$body = fmt($body);

print($body);
print "</article>\n";

exit(0);

# And this processes the body (we do this way so we can recursively handle 
# those pesky PRE flags)
sub process_body {
   my($body) = @_;
   my($hack,$filename);

   # The INCLUDE tag
   while($body =~ m|\<include\s+\"([^"]+)\"\s*\>|is) {
       $filename = $1;
       open(FILE,"< $filename") || die "Can not find file $filename\n";
       $hack = "";
       while(<FILE>) {$hack .= $_}
       close(FILE);
       #$hack = process_body($hack);
       $body =~ s|\<include\s+\"([^"]+)\"\s*\>|$hack|is;
       }

   # Convert all of the other EJ tags in to docbook equivalents

   # P: becomes para
   $body =~ s|<p>|</para>\n<para>|igs;
   # Get rid of the first </para>
   $body =~ s|</para>||;

   # UL/LI: Becomes itemizedlist
   $body =~ s|<ul>|<itemizedlist>|igs;
   $body =~ s|<li>|</para><listitem><para>|igs;
   $body =~ s|</ul>|</itemizedlist>|igs;

   # I: Becomes emphasis
   $body =~ s|<i>|<emphasis>|igs;
   $body =~ s|</i>|</emphasis>|igs;

   # B: Becomes emphasis role=bold
   $body =~ s|<b>|<emphasis role=bold>|igs;
   $body =~ s|</b>|</emphasis>|igs;

   # H1: Becomes title for a "sect1"
   $body =~ s|<h1>|<sect1><title>|igs;
   $body =~ s|</h1>|</title><para>|igs;

   # H2: Becomes title for a "sect2"
   $body =~ s|<h1>|<sect2><title>|igs;
   $body =~ s|</h1>|</title><para>|igs;

   # PRE: Becomes "literallayout" (which stays porportional)
   $body =~ s|<pre>|<literallayout>|igs;
   $body =~ s|</pre>|</literallayout>|igs;

   # BR (docbook doesn't really have this, so we have to hack)
   $body =~ s|<br>|</para><para>|igs;

   # BLOCKQUOTE 
   $body =~ s|<blockquote>|<blockquote><para>|igs;
   $body =~ s|</blockquote>|</para></blockquote>|igs;

   # TT (removed)
   $body =~ s|</?tt>||igs;

   # DL, DT, and DD
   $body =~ s|<dl>|<variablelist>|igs;
   $body =~ s|</dl>|</variablelist>|igs;
   $body =~ s|<dt>|<varlistentry><term>|igs;
   $body =~ s|<dd>|</term><listitem><para>|igs;

   # TABLE, TD, and TR
   $body =~ s|<table>|<programlisting>|igs;
   $body =~ s|<td>|\t|igs;
   $body =~ s|<tr>||igs;

   return $body;

   }

# This takes a string, and braks any lines longer than 75 columns; otherwise
# it performs no other formatting
# Input: The string to format
# Output: The formatted string

sub fmt {
   my($input) = @_;
   my($place,$lastspace,$column,$linebegin);
  
   $place = $lastspace = $column = $linebegin = 0;

   # Get rid of trailing white space, which confuses this algorithm
   $input =~ s/[ \t]+\n/\n/sg;

   # The core algorithm
   while($place < length($input)) {
       if(substr($input,$place,1) =~ /[ \t]/) {
           $lastspace = $place;
	   }
       if(substr($input,$place,1) =~ /\n/) {
           $column = -1;
	   $linebegin = $lastspace = $place + 1;
	   }
       if($column > 70 && $linebegin != $lastspace) {
           substr($input,$lastspace,1,"\n");
	   $place = $lastspace;
	   $column = -1;
	   $linebegin = $lastspace = $place + 1;
	   }
       $column++;
       $place++;
       }

   $input;
   } 

