#!/usr/bin/env perl

=pod

=head1 NAME

randomdata - create a data file for Circos track with random data

=head1 SYNOPSIS

  randomdata -karyotype karyotype.txt -ruleset 5mb 

  randomdata -karyotype karyotype.txt -rule ". 2e6 150/50 50 1"

  # set min/max bounds
  randomdata -karyotype karyotype.txt -rule ". 2e6 150/50 50 1" -min 25 -max 65

=head1 DESCRIPTION

The ruleset is

  chr_rx  bin_size  avg  sd  sampling  options

=head1 OPTIONS

=head1 HISTORY

=over

=item * 16 Apr 2014

Chromosomes now sorted in output.

=item * 11 Sep 2013

Updated documentation.

=back 

=head1 AUTHOR

Martin Krzywinski

=head1 CONTACT

Martin Krzywinski
Genome Sciences Center
BC Cancer Research Center
100-570 W 7th Ave
Vancouver BC V5Z 4S6

mkweb.bcgsc.ca
martink@bcgsc.ca

=cut

use strict;
use warnings FATAL=>"all";

use Carp;
use Config::General;
use Cwd qw(getcwd abs_path);
use Data::Dumper;
use File::Basename;
use FindBin;
use Getopt::Long;
use Math::VecStat qw(sum min max average);
use Pod::Usage;
use Time::HiRes qw(gettimeofday tv_interval);
use Storable;
use lib "$FindBin::RealBin";
use lib "$FindBin::RealBin/../lib";
use lib "$FindBin::RealBin/lib";

our (%OPT,%CONF,$conf);
our @COMMAND_LINE = ("file=s",
										 "configfile=s",
										 "rule=s",
										 "min=f",
										 "max=f",
										 "karyotype=s",
										 "ruleset=s",
										 "help",
										 "man",
										 "cdump",
										 "debug:i");
our $VERSION = 0.01;

# common and custom module imports below
use Regexp::Common;
#use IO::File;
#use List::Util;
#use List::MoreUtils;
use Math::Random qw(random_normal random_set_seed);
use Set::IntSpan;
#use Statistics::Descriptive;

# read and parse configuration file
_parse_config();
_dump_config();

random_set_seed(time,$$);
my $kfile = -e $CONF{karyotype} ? $CONF{karyotype} : "$FindBin::RealBin/../$CONF{karyotype}";
my $k     = read_karyotype($kfile);

$CONF{chr_rx} =~ s/,/|/g;
my $chrs = [ grep($_ =~ /$CONF{chr_rx}/, keys %$k) ];

my $rules;

my @ruleset;
if($CONF{rule}) {
	@ruleset = ($CONF{rule});
} else {
	@ruleset = ref($CONF{rules}{$CONF{ruleset}}{rule}) ? @{$CONF{rules}{$CONF{ruleset}}{rule}} : ($CONF{rules}{$CONF{ruleset}}{rule});
}

for my $rule (@ruleset) {
	my ($rx,$bin,$avg,$sd,$sampling,$options) = split(" ",$rule);
	my $nchr = grep($_ =~ /$rx/, @$chrs);
	$sd       ||= 0;
	$sampling ||= 1;
	push @$rules, {
								 nchr => $nchr,
								 rx   => $rx,
								 bin  => $bin,
								 avg  => $avg,
								 sd   => $sd,
								 options  => $options,
								 sampling => $sampling,
								};
}

$rules = [ sort {$b->{nchr} <=> $a->{nchr} || $b->{sampling} <=> $a->{sampling}} @$rules ];

my $data;
for my $r (@$rules) {
	for my $c ( sort @$chrs ) {
		next unless $c =~ /$r->{rx}/i;
		# determine all positions
		my @pos = get_positions($r,$k->{$c}->cardinality);
		for my $pos (@pos) {
			next if $r->{sampling} && rand() > $r->{sampling};
	    my $value   = get_value($r);
	    my $options = get_options($r);
	    my @tok = ($c,@$pos);
	    push @tok, $value if defined $value;
	    push @tok, $options if defined $options;
			push @{$data->{$c}}, \@tok;
	    #printinfo(@tok);
		}
	}
}

for my $c (sort { (($a =~ /(\d+)/g)[0] || int(@$chrs)) <=> (($b =~ /(\d+)/g)[0] || int(@$chrs)) } keys %$data) {
	for my $datum (@{$data->{$c}}) {
		printinfo(@$datum);
	}
}

sub get_avg_sd {
    my $str = shift;
    my ($avg,$sd) = split(/$CONF{delim}{avg_sd}/,$str);
    die "Could not determine avg,sd from string [$str]" if ! defined $avg;
    $sd ||= 0;
    return ($avg,$sd);
}
sub get_nrd {
    my ($avg,$sd) = @_;
		my $grd;
		while(! defined $grd) {
			$grd = random_normal(1,$avg,$sd);
			$grd = undef if defined $CONF{min} && $grd < $CONF{min};
			$grd = undef if defined $CONF{max} && $grd > $CONF{max};
		}
    return sprintf("%.3f",$grd);
}

sub get_positions {
	my ($r,$chr_len) = @_;
	my @pos;
	my $curr_pos = 0;
	my $continue = 1;
	do {
		my ($start,$end);
		if($r->{bin} =~ /$CONF{delim}{interval}/) {
	    my ($len_str,$space_str)  = split(/$CONF{delim}{interval}/,$r->{bin});
	    my ($len_avg,$len_sd)     = get_avg_sd($len_str);
	    my ($space_avg,$space_sd) = get_avg_sd($space_str);
	    my ($len,$space);
	    while(! $len && ! $space) {
				$len   = int max(0,get_nrd($len_avg,$len_sd));
				$space = int max(0,get_nrd($space_avg,$space_sd));
	    }
	    $start = int($curr_pos + $space);
	    $end   = $len ? int($start + $len - 1) : $start;
	    $end   = min($chr_len,$end);
	    printdebug(1,"interval",$len,$space,$start,$end);
		} else {
			my ($avg,$sd) = get_avg_sd($r->{bin});
			my $len = -1;
			while($len < 0) {
				$len = int(get_nrd($avg,$sd));
			}
			$start = $curr_pos;
			$end   = $curr_pos + $len - 1;
			$end   = min($chr_len,$end);
		}
		if( $start >= $chr_len ) {
	    $continue = 0;
		}
		if($continue) {
	    push @pos, [$start,$end];
	    $curr_pos = $end + 1;
		}
	} while ( $continue );
	return @pos;
}

sub get_options {
	my $r = shift;
	return unless defined $r->{options} && $r->{options} =~ /$CONF{delim}{option_assignment}/;
	my @options;
	my @pairs = split(/$CONF{delim}{option}/,$r->{options});
	for my $pair (@pairs) {
		my ($var,$value) = split(/$CONF{delim}{option_assignment}/,$pair);
		my @values = split(/$CONF{delim}{option_or}/,$value);
		$value = $values[rand @values];
		if($value =~ /$CONF{delim}{avg_sd}/) {
	    $value = get_nrd(split(/$CONF{delim}{avg_sd}/,$value));
		}
		printdebug(1,"option","pair",$pair,"var",$var,"str",$value,"values",@values,"value",$value);
		push @options, sprintf("%s%s%s",$var,$CONF{delim}{option_assignment},$value);
	}
	return join($CONF{delim}{option},@options);
}

sub get_value {
	my $r = shift;
	my $value;
	if($r->{avg} =~ /$CONF{delim}{value_or}/) {
		my @values = split(/$CONF{delim}{value_or}/,$r->{avg});
		$value = $values[rand @values];
	} elsif ($r->{avg} ne ".") {
		my @values = map { get_nrd(split($CONF{delim}{avg_sd},$_)) } split($CONF{delim}{value},$r->{avg});
		$value = join($CONF{delim}{value},@values);
	} else {
		$value = undef;
	}
	return $value;
}

sub read_karyotype {
  my $file = shift;
  open(F,$file);
  my $k;
  while(<F>) {
    chomp;
    if(/^chr/) {
      my @tok = split;
      my ($chr,$start,$end) = @tok[2,4,5];
      $k->{$chr} = Set::IntSpan->new("$start-$end");
    }
  }
  return $k;
}

sub validateconfiguration {
  $CONF{chr_rx} ||= ".";
  $CONF{ruleset} ||= "default";
}

# HOUSEKEEPING ###############################################################

sub _dump_config {
	if($CONF{cdump}) {
    printdumper(\%OPT,\%CONF);
		exit;
	}
}

sub _parse_config {
  my $dump_debug_level = 3;
  GetOptions(\%OPT,@COMMAND_LINE);
  pod2usage() if $OPT{help};
  pod2usage(-verbose=>2) if $OPT{man};
  loadconfiguration($OPT{configfile});
  populateconfiguration(); # copy command line options to config hash
  validateconfiguration(); 
  if(defined $CONF{debug} && $CONF{debug} == $dump_debug_level) {
    $Data::Dumper::Indent    = 2;
    $Data::Dumper::Quotekeys = 0;
    $Data::Dumper::Terse     = 0;
    $Data::Dumper::Sortkeys  = 1;
    $Data::Dumper::Varname = "OPT";
    printdumper(\%OPT);
    $Data::Dumper::Varname = "CONF";
    printdumper(\%CONF);
    exit;
  }
}

sub populateconfiguration {
  for my $var (keys %OPT) {
    $CONF{$var} = $OPT{$var};
  }
  repopulateconfiguration(\%CONF);
}

sub repopulateconfiguration {
  my $root     = shift;
  return unless ref($root) eq "HASH";
  for my $key (keys %$root) {
      my $value = $root->{$key};
      if(ref($value) eq "HASH") {
	  repopulateconfiguration($value);
      } elsif (ref($value) eq "ARRAY") {
	  for my $item (@$value) {
	      repopulateconfiguration($item);
	  }
      } elsif(defined $value) {
	  while($value =~ /__([^_].+?)__/g) {
	      my $source = "__" . $1 . "__";
	      my $target = eval $1;
	      $value =~ s/\Q$source\E/$target/g;
	  }
	  $root->{$key} = $value;
      }
  }
}

################################################################
#
#

sub loadconfiguration {
  my $file = shift;
  if(defined $file) {
    if(-e $file && -r _) {
      # provided configuration file exists and can be read
      $file = abs_path($file);
    } else {
      confess "The configuration file [$file] passed with -configfile does not exist or cannot be read.";
    }
  } else {
    # otherwise, try to automatically find a configuration file
    my ($scriptname,$path,$suffix) = fileparse($0);
    my $cwd     = getcwd();
    my $bindir  = $FindBin::RealBin;
    my $userdir = $ENV{HOME};
    my @candidate_files = (
	"$cwd/$scriptname.conf",
	"$cwd/etc/$scriptname.conf",
	"$cwd/../etc/$scriptname.conf",
	"$bindir/$scriptname.conf",
	"$bindir/etc/$scriptname.conf",
	"$bindir/../etc/$scriptname.conf",
	"$userdir/.$scriptname.conf",
	);
    my @additional_files = (
	
	);
    for my $candidate_file (@additional_files,@candidate_files) {
	#printinfo("configsearch",$candidate_file);
	if(-e $candidate_file && -r _) {
	    $file = $candidate_file;
	    #printinfo("configfound",$candidate_file);
	    last;
	}
    }
  }
  if(defined $file) {
    $OPT{configfile} = $file;
    $conf = new Config::General(
	-ConfigFile=>$file,
	-IncludeRelative=>1,
	-IncludeAgain=>1,
	-ExtendedAccess=>1,
	-AllowMultiOptions=>"yes",
	-LowerCaseNames=>1,
	-AutoTrue=>1
	);
    %CONF = $conf->getall;
  }
}

sub printdebug {
    my ($level,@msg) = @_;
    my $prefix = "debug";
    if(defined $CONF{debug} && $CONF{debug} >= $level) {
	printinfo(sprintf("%s[%d]",$prefix,$level),@msg);
    }
}

sub printinfo {
    print join(" ",@_),"\n";
}

sub printdumper {
    print Dumper(@_);
}

