#!/usr/bin/env perl
use warnings;
use strict;
use FindBin;
use lib "$FindBin::RealBin/../perl5";
###LINE_FOR_BREW_CONDA###
use Snippy::Version;
use Data::Dumper;
use Bio::Tools::GFF;
use Bio::SeqIO;
use Fatal;

# . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
# Globals: version string and script name, both used by the help/version output

my $VERSION = Snippy::Version->version;
my $EXE = $FindBin::RealScript;

# . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
# Command line checking

# Option variables are declared here and filled in by setOptions().
my (@Options, $debug, $auto, $vcf, $ref, $gff, $version);
setOptions();

# --auto only supplies the file names the user did not give explicitly.
if ($auto) {
  $vcf = 'snps.vcf'          unless $vcf;
  $ref = 'reference/ref.fa'  unless $ref;
  $gff = 'reference/ref.gff' unless $gff;
}

# The VCF and the reference are mandatory; the GFF is optional.
die "need --vcf <snps.vcf>" unless $vcf;
die "need --ref <ref.fa>" unless $ref;

# . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
# Create an index of seq:pos => feature
#
#   %seq  : sequence id     => sequence object read from the reference FASTA
#   %feat : feature number  => feature object read from the GFF (numbered from 1)
#   %olap : sequence id     => array indexed by position holding a feature number

my %seq;
print STDERR "Loading reference: $ref\n";
my $fa = Bio::SeqIO->new(-format => 'fasta', -file => $ref);
while (my $record = $fa->next_seq) {
  my $id = $record->id;
  $seq{$id} = $record;
}
my $nseq = keys %seq;
print STDERR "Loaded $nseq sequences.\n";

my %feat;
my %olap;
my $nfeat = 0;

# Features are only indexed when a GFF file was supplied.
if ($gff) {
  print STDERR "Loading features: $gff\n";
  my $reader = Bio::Tools::GFF->new(-file => $gff, -gff_version => 3);
  while (my $feature = $reader->next_feature) {
    my $sid = $feature->seq_id;
    $feature->attach_seq( $seq{$sid} );
    $nfeat++;
    $feat{$nfeat} = $feature;
    # Every position from start to end points at this feature; a feature read
    # later overwrites an earlier one at the same position.
    # FIXME: should be splice ?
    $olap{$sid}[$_] = $nfeat for $feature->start .. $feature->end;
  }
}

# . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
# Create a SNP table file
#
# Writes a header line and then one tab-separated row per VCF data line to
# STDOUT. Progress messages go to STDERR. The annotation columns in @ANNO are
# only filled in when the variant position overlaps an indexed GFF feature.

my @ANNO = qw(FTYPE STRAND NT_POS AA_POS EFFECT LOCUS_TAG GENE PRODUCT);
my $count=0;
print join("\t", qw(CHROM POS TYPE REF ALT EVIDENCE), @ANNO), "\n";

print STDERR "Parsing variants: $vcf\n";
# open() is checked explicitly: a bare "use Fatal;" (no import list) does not
# make any built-in fatal, so an unreadable VCF would otherwise silently
# produce a header-only table.
open my $vcf_fh, '<', $vcf or die "Could not open VCF file '$vcf': $!\n";
while (my $line = <$vcf_fh>) {
  next if $line =~ m/^#/;        # meta-information and header lines
  chomp $line;
  next unless $line =~ m/\S/;    # blank lines carry no variant

  # [VCF] CHROM POS ID REF ALT QUAL FILTER INFO [FORMAT DATA DATA2 ...]
  my($chr,$pos,undef,$ref_base,$alt_base,undef,undef,$tags) = split m/\t/, $line;
  my $tag = vcf_tags_to_hash($tags);

  # Start with every annotation column empty.
  my %anno = (map { ($_ => '') } @ANNO);

  # FIXME - need to do span on MNP or COMPLEX
  my $aff_featid = $olap{ $chr }[ $pos ]; # does this overlap a feature?
  if ($aff_featid) {
    my $f = $feat{$aff_featid};
    $anno{FTYPE} = $f->primary_tag;
    $anno{STRAND} = $f->strand < 0 ? '-' : $f->strand > 0 ? '+' : '.';
    # Copy any feature tag whose name is the lower-cased column name
    # (e.g. locus_tag, gene, product).
    for my $t (@ANNO) {
      $anno{$t} = ($f->get_tag_values(lc($t)))[0] if $f->has_tag(lc($t));
    }

    # convert snpEff result into something readable!
    ##INFO=<ID=ANN,Number=.,Type=String,Description="Functional annotations: 
    # 0         1            2                   3           4         5              6             7                    8
    # 'Allele | Annotation | Annotation_Impact | Gene_Name | Gene_ID | Feature_Type | Feature_ID  | Transcript_BioType | Rank 
    #   9        10       11                       12                     13                   14         15
    # | HGVS.c | HGVS.p | cDNA.pos / cDNA.length | CDS.pos / CDS.length | AA.pos / AA.length | Distance | ERRORS / WARNINGS / INFO'
    if ($tag->{ANN}) {
      my @eff = split m/\|/, $tag->{ANN};
      $anno{NT_POS} = $eff[11];
      $anno{AA_POS} = $eff[13];
      # EFFECT = Annotation, HGVS.c and HGVS.p; missing fields become empty.
      $anno{EFFECT} = join(' ', map { defined($_) ? $_ : '' } @eff[1,9,10] );
    }
    else {
      $anno{EFFECT} = "variant did not have ANN annotation";
    }
  }

  # Evidence column: "ALT:<AO> REF:<RO>", with a missing count shown as 0.
  # freebayes VCF seems to use different tags when TYPE=indel ...
  my $RO = $tag->{RO} || '0';
  my $AO = $tag->{AO} || '0';
  my $evid = "$alt_base:$AO $ref_base:$RO";

  # A record without a TYPE tag gets an empty TYPE column instead of an
  # "uninitialized value" warning.
  my $type = defined($tag->{TYPE}) ? $tag->{TYPE} : '';

  print join("\t",
    $chr, $pos, $type, $ref_base, $alt_base, $evid, (map { $anno{$_} || '' } @ANNO)
  ), "\n";
  $count++;
}
close $vcf_fh;
print STDERR "Converted $count SNPs to TAB format.\n";

#----------------------------------------------------------------------
# name1=tag1;name2;name3=tag3 => HASH

# Parse a VCF INFO string ("name1=tag1;name2;name3=tag3") into a hash.
#
# Argument: the INFO column text; undef is treated as an empty string.
# Returns:  a reference to a hash of name => value.
#           - A bareword flag with no "=" (eg. INDEL) maps to undef.
#           - "name=" maps to the empty string.
#           - Only the FIRST "=" separates name from value, so a value that
#             itself contains "=" is kept whole rather than truncated.
#           - Empty segments (as in "a=1;;b=2") are ignored.
sub vcf_tags_to_hash {
  my($s) = @_;
  my %tag;
  return \%tag unless defined $s;
  for my $pair (split m/;/, $s) {
    next if $pair eq '';
    # LIMIT of 2 keeps any further "=" characters inside the value.
    my($name, $value) = split m/=/, $pair, 2;
    $tag{$name} = $value;  # will be =>undef for bareword tags eg. INDEL;
  }
  return \%tag;
}

#----------------------------------------------------------------------
# Print "<script name> <version>" followed by a newline, then exit with status 0.
sub show_version {
  printf "%s %s\n", $EXE, $VERSION;
  exit 0;
}

#----------------------------------------------------------------------
# Option setting routines

# Fill in the file-level @Options table, parse @ARGV with Getopt::Long and
# apply each entry's DEFAULT to any option variable that is still undefined.
# Calls usage() (which exits) when there are no arguments at all or when
# option parsing fails.
sub setOptions {
  use Getopt::Long;

  # OPT is the Getopt::Long spec, VAR the destination (a scalar ref or a
  # callback), DEFAULT the fallback value and DESC the help text.
  @Options = (
    { OPT => "help!",    VAR => \&usage,        DESC => "This help" },
    { OPT => "version!", VAR => \&show_version, DESC => "Print version and exit" },
    { OPT => "debug!",   VAR => \$debug, DEFAULT => 0,  DESC => "Output verbose debug info" },
    { OPT => "auto!",    VAR => \$auto,  DEFAULT => 0,  DESC => "Autoset --vcf/ref/gff to default Snippy names" },
    { OPT => "vcf=s",    VAR => \$vcf,   DEFAULT => '', DESC => "VCF input file" },
    { OPT => "ref=s",    VAR => \$ref,   DEFAULT => '', DESC => "FASTA reference sequence" },
    { OPT => "gff=s",    VAR => \$gff,   DEFAULT => '', DESC => "GFF reference features" },
  );

  # Running with no arguments at all just shows the help.
  usage() unless @ARGV;

  # Flatten the table into the (spec, destination) pairs GetOptions expects.
  my @spec;
  for my $opt (@Options) {
    push @spec, $opt->{OPT}, $opt->{VAR};
  }
  GetOptions(@spec) or usage();

  # Now setup default values.
  # Entries without a DEFAULT (the callback options) are skipped before
  # their VAR is ever dereferenced as a scalar.
  for my $opt (@Options) {
    next unless defined $opt->{DEFAULT};
    next if defined ${ $opt->{VAR} };
    ${ $opt->{VAR} } = $opt->{DEFAULT};
  }
}

# Print the synopsis, usage line and one line per entry of @Options to
# STDERR, then exit with status 1. Any arguments passed in are ignored.
sub usage {
  select STDERR;

  my @help = (
    "SYNOPSIS\n  $EXE $VERSION - convert Snippy VCF into TSV with column breakdown\n",
    "USAGE\n  $EXE [options] --ref ref.fa [--gff ref.gff] --vcf snps.vcf > snp.tab\n",
    "OPTIONS\n",
  );

  for my $opt (@Options) {
    # Only options that declare a DEFAULT get the "(default ...)" suffix.
    my $default = defined($opt->{DEFAULT}) ? " (default '$opt->{DEFAULT}')" : "";
    push @help, sprintf("  --%-13s %s%s.\n", $opt->{OPT}, $opt->{DESC}, $default);
  }

  print @help;
  exit(1);
}
 
#----------------------------------------------------------------------
