#!/bin/env perl

=pod

=head1 NAME

parse-category - parse categorial data into a Circos plot

=head1 SYNOPSIS

  cat data.txt | parse-category
  parse-category -input data.txt

=head1 DESCRIPTION

This script formats data into a plot that encodes the relationship
between two categorical variables, such as salary and level of
training.

Each category within the variable is assigned to a segment on the
circle.

Ribbons represent the number of data points (e.g. indidividuals) for
which a given variable pair has been observed. Ribbon format can be
coded by the value of a specific column. Radial position of the ribbon
can be coded by the value of a specific column.

=head1 CONFIGURATION

All inputs are controlled through the configuration file.

=head1 HISTORY

=over

=item * 21 Apr 2014 v0.11

Updated documentation.

=item * 16 Feb 2009 v0.1

Expanded and versiond.

=back 

=head1 BUGS

=head1 AUTHOR

Martin Krzywinski

=head1 CONTACT

  Martin Krzywinski
  Genome Sciences Centre
  Vancouver BC Canada
  www.bcgsc.ca
  martink@bcgsc.ca

=cut

use strict;
use Config::General;
use Data::Dumper;
use File::Basename;
use FindBin;
use Getopt::Long;
use IO::File;
use List::Util;
use List::MoreUtils qw(uniq);
use Math::VecStat qw(sum min max average);
use Pod::Usage;
use Set::IntSpan;
use Statistics::Descriptive;
use Storable;
use Time::HiRes qw(gettimeofday tv_interval);
use lib "$FindBin::RealBin";
use lib "$FindBin::RealBin/../lib";
use lib "$FindBin::RealBin/lib";
use vars qw(%OPT %CONF);

################################################################
#
# *** YOUR MODULE IMPORTS HERE
#
################################################################

GetOptions(\%OPT,
					 "input=s",
					 "configfile=s",
					 "help",
					 "man",
					 "debug+");

pod2usage() if $OPT{help};
pod2usage(-verbose=>2) if $OPT{man};
loadconfiguration($OPT{configfile});
populateconfiguration(); # copy command line options to config hash
validateconfiguration(); 
if($CONF{debug} > 1) {
  $Data::Dumper::Pad = "debug parameters";
  $Data::Dumper::Indent = 1;
  $Data::Dumper::Quotekeys = 0;
  $Data::Dumper::Terse = 1;
  print Dumper(\%CONF);
}

my $inputhandle = get_handle();

my @links;
while(<>) {

  chomp;
	next if /^\s*\#/;

  my @tok = split(/\t/,$_);

  my $link_start = $tok[$CONF{link_start}{col}];
  my $link_end   = $tok[$CONF{link_end}{col}];

  next unless $link_start =~ /$CONF{link_start}{rx}/;
  next unless $link_end   =~ /$CONF{link_end}{rx}/;

  next unless pass_min_max($link_start,$CONF{link_start}{min},$CONF{link_start}{max});
  next unless pass_min_max($link_end,$CONF{link_end}{min},$CONF{link_end}{max});

  my $link_format = {%{$CONF{link_default}}};
  my $coding_code;
  for my $coding_name (sort {$CONF{link_coding}{$a}{order} <=> $CONF{link_coding}{$b}{order}} keys %{$CONF{link_coding}}) {
    my $coding_branch = $CONF{link_coding}{$coding_name};
    next unless $coding_branch->{use};
    if(! defined $tok[$coding_branch->{col}]) {
      if($coding_branch->{values}{missing}) {
				for my $var_datum (split(",",$coding_branch->{values}{missing})) {
					my ($var,$value) = split("=",$var_datum);
					$link_format->{$var} = $value;
				}
      }
    } else {
      for my $value (keys %{$coding_branch->{values}}) {
				if($tok[$coding_branch->{col}] eq $value) {
					my $var_data = $coding_branch->{values}{$value};
					$coding_code .= $value;
					for my $var_datum (split(",",$var_data)) {
						my ($var,$value) = split("=",$var_datum);
						$link_format->{$var} = $value;
					}
				}
      }
    }
  }
  my $link = {id1=>$CONF{link_start}{id},
							pos1=>$link_start,
							id2=>$CONF{link_end}{id},
							pos2=>$link_end,
							code=>$coding_code,
							format=>$link_format};
  # assign a radius to the ends of the link, if requested
  for my $type (qw(start end)) {
    if($CONF{"link_".$type}{radius}{use}) {
      my $value = $tok[ $CONF{"link_".$type}{radius}{col}];
      if($CONF{"link_".$type}{radius}{remap}) {
				my $remap_value = $CONF{"link_".$type}{radius}{remap};
				$remap_value =~ s/x/$value/g;
				$value = eval $remap_value;
      }
      my $leaf = $CONF{"link_".$type}{radius};
      my $r = $leaf->{rmin} + ($leaf->{rmax}-$leaf->{rmin})*($value-$leaf->{min})/($leaf->{max}-$leaf->{min});
      $link->{ $type eq "start" ? "radius1" : "radius2" } = $r;
    }
  }
  # idX  - name of the category for the start/end of the link
  # poSX - value of the variable in the category (this determines which segment the link is on)
  push @links, $link;
}

# calculate statistics for links associated with each category/value on both ideograms
# - number of links out/to a given category/value (for each different format)
# - number of links between a given combination of category/values (for each different format)

my $stats;
for my $link (@links) {
  my $format = make_format_string( $link->{format} );
  $stats->{ from }{ $link->{id1} }{ $link->{pos1} }{ n }{ $format }++;
  $stats->{  to  }{ $link->{id2} }{ $link->{pos2} }{ n }{ $format }++;
  $stats->{ from }{ $link->{id1} }{ $link->{pos1} }{ to }{ $link->{id2} }{ $link->{pos2} }{ $format }++;
}

# Each link will contribute to a ribbon. Here, assign links to ribbons, creating ribbons
# where necessary. A new ribbon is created for a new combination of
# - category/value
# - radial position
# - format (color, thickness, etc)

my @ribbons = ();
for my $link (@links) {
  my $ribbon;
  if(@ribbons) {
    ($ribbon) = grep(
										 $_->{start}{id}     eq $link->{id1}     &&
										 $_->{start}{pos}    eq $link->{pos1}    &&
										 $_->{start}{radius} eq $link->{radius1} &&
										 $_->{end}{id}       eq $link->{id2}     &&
										 $_->{end}{pos}      eq $link->{pos2}    &&
										 $_->{end}{radius}   eq $link->{radius2} &&
										 make_format_string($_->{format}) eq make_format_string($link->{format}), @ribbons);
  }
  if(! $ribbon) {
    $ribbon = { start=>{id=>$link->{id1},pos=>$link->{pos1},radius=>$link->{radius1}},
								end=>{id=>$link->{id2},pos=>$link->{pos2},radius=>$link->{radius2}},
								format=>$link->{format},
								size=>1,
								code=>$link->{code} };
    push @ribbons, $ribbon;
  } else {
    $ribbon->{size}++;
  }
}

# Determine the position of the ribbons within each category/value segment. This is done based
# on the values in the <link_order> block.

for my $type ($CONF{link_order}{start} eq "otherend" ? qw(end start) : qw(start end)) {
  for my $id (uniq( map {$_->{$type}{id}} @ribbons)) {
    for my $pos (uniq( map {$_->{$type}{pos}} grep($_->{$type}{id} eq $id, @ribbons))) {
      my @r = grep($_->{$type}{id} eq $id && $_->{$type}{pos} eq $pos, @ribbons);
      if($CONF{link_order}{$type} eq "size") {
				@r = sort { $b->{size} <=> $a->{size} } @r;
      } elsif ($CONF{link_order}{$type} eq "coding") {
				@r = sort { ($a->{code} cmp $b->{code}) || ($b->{size} <=> $a->{size}) } @r;
      } elsif ($CONF{link_order}{$type} eq "radius") {
				@r = sort { $a->{$type}{radius} <=> $b->{$type}{radius} } @r;
      } elsif ($CONF{link_order}{$type} eq "otherend") {
				my $otype = $type eq "start" ? "end" : "start";
				@r = sort { ($a->{$otype}{pos} <=> $b->{$otype}{pos}) || ($a->{$otype}{u} <=> $b->{$otype}{v}) } @r;
      }
      # determine the start/end positions (u/v) of the ribbon on the category/value segment
      my $cumul_pos = 0;
      for my $r (@r) {
				#printinfo($type,$id,$pos,$r->{start}{radius});
				$r->{$type}{u} = $cumul_pos;
				$r->{$type}{v} = $cumul_pos + $r->{size};
				$cumul_pos += $r->{size};
      }
    }
  }
}

my $linkid=0;
for my $ribbon (@ribbons) {
  my @format;
  push @format, sprintf("%s",make_format_string($ribbon->{format}));
  push @format, sprintf("radius1=%.3fr",$ribbon->{start}{radius}) if defined $ribbon->{start}{radius};
  push @format, sprintf("radius2=%.3fr",$ribbon->{end}{radius}) if defined $ribbon->{end}{radius};
  push @format, sprintf("z=%d",$ribbon->{size});
  printinfo(sprintf("link%d %s-%d %d %d %s",
										$linkid,
										@{$ribbon->{start}}{qw(id pos u v)},
										join(",",@format)));
  printinfo(sprintf("link%d %s-%d %d %d %s",
										$linkid,
										@{$ribbon->{end}}{qw(id pos u v)},
										join(",",@format)));
  $linkid++;
}

for my $end (qw(start end)) {
  my $dir = $end eq "start" ? "from" : "to";
  for my $id (sort keys %{$stats->{$dir}}) {
    for my $pos (sort {$end eq "start" ? $a <=> $b : $b <=> $a} keys %{$stats->{$dir}{$id}}) {
      my $size = sum( values %{$stats->{$dir}{$id}{$pos}{n}} );
      my $idname  = sprintf("%s-%s",$id,$pos);
      my $idlabel = sprintf("%s-%s",$id,$pos);
      my $idcolor = sprintf("%s_a%d",$CONF{"link_".$end}{color}, ($pos%10)||1 );
      print STDERR sprintf("chr - %s %s 0 %d %s\n",
													 $idname,$idlabel,
													 $size,
													 $idcolor);
    }
  }
}

sub pass_min_max {
  my ($value,$min,$max) = @_;
  my $pass = 1;
  $pass = 0 if defined $min && $value < $min;
  $pass = 0 if defined $max && $value > $max;
  return $pass;
}

sub make_format_string {
  my $format = shift;
  my @vars;
  for my $var (sort keys %$format) {
    push @vars, sprintf("%s=%s",$var,$format->{$var});
  }
  return join(",",@vars);
}

sub get_handle {
  if(my $file = $CONF{input}) {
    die "No such file $file" unless -e $file;
    open(FILE,$file);
    $inputhandle = \*FILE;
  } else {
    $inputhandle = \*STDIN;
  }
}

sub validateconfiguration {

}

################################################################
#
# *** DO NOT EDIT BELOW THIS LINE ***
#
################################################################

sub populateconfiguration {
  foreach my $key (keys %OPT) {
    $CONF{$key} = $OPT{$key};
  }
  repopulateconfiguration(\%CONF);
}

sub repopulateconfiguration {
  my $root     = shift;
  for my $key (keys %$root) {
    my $value = $root->{$key};
    if(ref($value) eq "HASH") {
      repopulateconfiguration($value);
    } elsif (ref($value) eq "ARRAY") {
      for my $item (@$value) {
        repopulateconfiguration($item);
      }
    } else {
      while($value =~ /__([^_].+?)__/g) {
        my $source = "__" . $1 . "__";
        my $target = eval $1;
        $value =~ s/\Q$source\E/$target/g;
      }
      $root->{$key} = $value;
    }
  }
}

sub loadconfiguration {
  my $file = shift;
  my ($scriptname) = fileparse($0);
  if(-e $file && -r _) {
    # great the file exists
  } elsif (-e "/home/$ENV{LOGNAME}/.$scriptname.conf" && -r _) {
    $file = "/home/$ENV{LOGNAME}/.$scriptname.conf";
  } elsif (-e "$FindBin::RealBin/$scriptname.conf" && -r _) {
    $file = "$FindBin::RealBin/$scriptname.conf";
  } elsif (-e "$FindBin::RealBin/etc/$scriptname.conf" && -r _) {
    $file = "$FindBin::RealBin/etc/$scriptname.conf";
  } elsif (-e "$FindBin::RealBin/../etc/$scriptname.conf" && -r _) {
    $file = "$FindBin::RealBin/../etc/$scriptname.conf";
  } else {
    return undef;
  }
  $OPT{configfile} = $file;
  my $conf = new Config::General(-ConfigFile=>$file,
																 -IncludeAgain=>1,
																 -AllowMultiOptions=>"yes",
																 -LowerCaseNames=>0,
																 -AutoTrue=>1);
  %CONF = $conf->getall;
}

sub printdebug {
  printinfo("debug",@_)  if $CONF{debug};
}

sub printdumper {
  printinfo(Dumper(@_));
}

sub printinfo {
  printf("%s\n",join(" ",@_));
}

