#!/usr/bin/perl

# Gabriel L. Somlo - 2004,2005

# QueryTracker is free software; you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by the
# Free Software Foundation; either version 2, or (at your option) any
# later version.
#
# QueryTracker is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with QueryTracker; see the file COPYING.  If not, write to the Free
# Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
# 02111-1307, USA.  



# global shared variables:
my %idfdocs; # list of documents (unique by URL) used in IDF
my $d; # number of documents used in IDF
my %df; # document frequency by term, used in IDF
my %stophash; # stopwords that need to be ignored in tf-idf processing


# global variables set from configfile:
my $mailto;
my $queryfile;
my $stopfile;
my $vardir;
my $varlockf;
my $disfile;
my $fbkfile;
my $profile;
my $maxhits;
my $prefer_minmx_thr;
my $init_threshold;
my $adapt_alpha;
my $minmx_alpha;
my $thrmethod;
my $wg_ua = "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.7.2) Gecko/20040806";

# load config from configfile
##### FIXME - make this load a real config file !
sub load_config {
  my $confname = shift; # name of configfile to read

  $mailto = 'somlo@acns.colostate.edu';
  $queryfile = "/home/QueryTracker/Query.list";
#  $queryfile = "/home/QueryTracker/Query.list.test";
  $stopfile = "/home/QueryTracker/stopword.ini";
  $vardir = "/home/QueryTracker/var";
#  $vardir = "/home/QueryTrackerFoo/var";
  $varlockf = "$vardir/QueryTracker.lock";
  $disfile = "DissemHits";
  $pretrn = "_pretrn";	# name of dir. containing pre_train positive examples
  $fbkfile = "FbkLog";
  $profile = "Profile";
  $maxhits = 100; # max. number of downloaded hits per query
  $prefer_minmx_thr = 1; # prefer min-max profile threshold instead of adaptive
  $init_threshold = 0.2; # initial value for adaptive profile threshold
  $adapt_alpha = 0.09; # learning rate for dissemination thresholds
  $minmx_alpha = 0.04; # min-max distance fraction for setting threshold
  $thrmethod = "circ"; # threshold checking method (lin, sqr, circ)
  # what wget should use to lie about its user_agent identity:
  $wg_ua = "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.7.2) Gecko/20040806";
} # load_config()


my $log_date;
sub log_entry {
  my $string = shift;
  print( "QueryTracker: $log_date: $string\n" );
} # log_entry()


# return date strings for today and yesterday
sub get_date_strings {
  my $stamp = time;
  my @t = localtime( $stamp );
  $today = sprintf( "%02d-%02d-%02d", $t[5] % 100, $t[4] + 1, $t[3] );
  @t = localtime( $stamp - 86400 ); # 60*60*24, or 24 hours earlier
  $yesterday = sprintf( "%02d-%02d-%02d", $t[5] % 100, $t[4] + 1, $t[3] );
  return( $today, $yesterday );
}


# given a query, remove all documents with no user feedback from previous day
# if any documents had feedback on the previous day, add them to the profile
sub process_yesterday_feedback {
  my $query = shift;
  my $yesterday = shift;
  my ($name, $terms, $rth, $qth) = split( /:/, $query );
  my $dir = "$vardir/$name";

  log_entry( "proc_yester_fbk: $name: starting." );

  # read in and delete feedback file
  unless ( open(FBK, "$dir/$fbkfile") ) {
    system( "rm -rf $dir/$yesterday" ); # no feedbck, remove yesterday's results
    log_entry( "proc_yester_fbk: $name: no feedback, deleting $yesterday." );
    return;
  }
  my @fbkdocs = grep( /^$yesterday/, <FBK> );
  chomp( @fbkdocs );
  close( FBK );
  system( "rm -f $dir/$fbkfile" );
  log_entry( "proc_yester_fbk: $name: Got feedback list." );

  # build hash of feedback documents
  my %feedback;
  for my $fdoc ( @fbkdocs ) {
    my ($date, $dfname, $rel) = split( /\/| /, $fdoc );
    $feedback{$dfname} = $rel;
  }
  log_entry( "proc_yester_fbk: $name: Done building feedback hash." );

  # transcribe disseminated documents that were fed back;
  # also append positives to profile
  my $profile_modified = 0;
  system( "mv -f $dir/$yesterday/$disfile $dir/$yesterday/$disfile.orig" );
  open( ODIS, "$dir/$yesterday/$disfile.orig" ) or
                         die( "Can't open $name/$yesterday/$disfile.orig: $!" );
  open( NDIS, ">$dir/$yesterday/$disfile" ) or
                        die( "Can't create $name/$yesterday/$disfile: $!" );
  while( <ODIS> ) {
    chomp;
    my ($url, $status, $dfname) = split;
    if ( !$feedback{$dfname} ) {
      system( "rm -f $dir/$yesterday/$dfname" ); # delete no-feedback document
      next; # also skip writing it to the new document list
    }
    print( NDIS "$_\n" ); # write yes-feedback document to new list
    # if we open the profile from perl we'll create it before we know
    # that we actually want to write to it. Using echo only creates the file
    # when there's something we want to write to it...
    if ( $feedback{$dfname} eq "R" ) {
      system( "echo $yesterday/$dfname >> $dir/$profile" );
      $profile_modified = 1;
    }
  }
  close( ODIS );
  close( NDIS );
  # unique the profile (in case of multiple feedback for same doc.)
  if ( $profile_modified ) {
    system( "sort -u $dir/$profile > $dir/$profile.tmp; \
             mv -f $dir/$profile.tmp $dir/$profile" );
  }
  log_entry( "proc_yester_fbk: $name: Done rewriting $profile." );

} # process_yesterday_feedback()


# given a query, add latest version of each distinct url to %idfdocs hash
# return hash ref. containing query-specific latest version for each url
# also ret. array ref. containing pre_train documents 
sub select_idf_documents {
  my $query = shift;
  my ($name, $terms, $rth, $qth) = split( /:/, $query );
  my $dir = "$vardir/$name";

  log_entry( "select_idf_doc: $name: starting." );

  # get results stored from previous days -- or create new query dir.
  my @pdates;
  if ( opendir(QDIR, $dir) ) {
    @pdates = sort( grep({!/^\./ && !/$pretrn/&& -d "$dir/$_"} readdir(QDIR)) );
    closedir( QDIR );
  }
  log_entry( "select_idf_doc: $name: Got list of previous dates." );

  # add most recent ver. of all previous url to %query_urls hash
  my %query_urls;
  for my $d ( @pdates ) {
    open( DIS, "$dir/$d/$disfile" ) or
                             die( "Can't open $disfile from $d for $name: $!" );
    while ( <DIS> ) {
      chomp;
      my ($url, $status, $dfname) = split;
      # if $query_urls{$url} is undefined or older than date $d, overwrite it
      $query_urls{$url} = "$d/$dfname" if ( "$d/$dfname" gt $query_urls{$url} );
    }
    close( DIS );
  }
  log_entry( "select_idf_doc: $name: Done building query_urls hash." );

  # grab al the pre_train document names, if any
  my @query_traindocs;
  if ( opendir(TRNDIR, "$dir/$pretrn") ) {
    @query_traindocs = grep( {-f "$dir/$pretrn/$_"} readdir(TRNDIR) );
    closedir( TRNDIR );
    for my $i (0..$#query_traindocs) {
      $query_traindocs[$i] =~ s/^/$pretrn\//;	# prepend "_pretrn/" dir. name
    }
    log_entry( "select_idf_doc: $name: Done loading pre_train documents." );
  }

  # add contents of %query_urls to global %idfdocs hash
  # might need to do this with mutex access to %idfdocs !!!
  while( ($url, $datedfname) = each(%query_urls) ) {
    if ( $idfdocs{$url} ) {
      my @dirsplit = split( /\//, $query_urls{$url} );
      my $glob_date = $dirsplit[$#dirsplit-1];
      my $local_date = (split(/\//, $datedfname))[0];
      # make sure latest version is added in case of a collision
      $idfdocs{$url} = "$dir/$datedfname" if( $local_date gt $glob_date );
    } else {
      $idfdocs{$url} = "$dir/$datedfname";
    }
  }

  # add contents of @query_traindocs to global %idfdocs hash
  # no way to check for collisions, so make up unique key for each doc. added
  for my $td ( @query_traindocs ) {
    $idfdocs{"$dir/$td"} = "$dir/$td";
  }

  log_entry( "select_idf_doc: $name: Done adding to idfdocs hash." );

  return( \%query_urls, \@query_traindocs );

} # select_idf_documents()


## ps2ascii is broken; we need a patched ps2ascii.ps file,
## and to unroll the ps2ascii command:
my $ps2ascii_ps = "/home/QueryTracker/ps2ascii.ps";
my $ps2pdf_cmd = "/usr/bin/gs -q -dNODISPLAY -dSAFER -dNOBIND -dWRITESYSTEMDICT -dSIMPLE -c save -f $ps2ascii_ps - -c quit";

# given a path to an html file, return ref. to hash containing the tf vector
sub doc2tfvec {
  my $filename = shift;
  die( "File \"$filename\" does not exist: $!" ) if ( ! -e $filename );
  my $filetype = `file $filename`;
  if ( $filetype =~ /PostScript/ ) {	# open as postscript file:
    open( DOCUMENT, "cat $filename | $ps2pdf_cmd |") or
                           die( "Could not open as postscript: $filename: $!" );
  } elsif ( $filetype =~ /PDF document/ ) {	# open as pdf file:
    open( DOCUMENT, "/usr/bin/pdftotext $filename - |") or
                           die( "Could not open as pdf: $filename: $!" );
  } else {	# default assumption is html:
    open( DOCUMENT, "/usr/bin/lynx -dump -nolist -force-html $filename |") or
                           die( "Could not open as html: $filename: $!" );
  }
  binmode( DOCUMENT, ":crlf" );	# bypas the ${LANG} env. variable
  local $/;                     # enable "slurp" mode
  local $_ = <DOCUMENT>;        # get whole file into $_
  close( DOCUMENT );
  s/\W/ /g;                     # get rid of non-word characters
  s/\b\w\b/ /g;                 # get rid of single letters or digits
  s/\b\w*[\137]+\w*\b/ /g;      # get rid of words containing _
  s/\b[[:xdigit:]]+\b/ /g;      # get rid of dec or hex numbers
  s/\b\d+\w+\b/ /g;             # get rid of words starting with digits
  s/\b[[:alpha:]]+\d+\w+\b/ /g; # get rid of words starting with letter-digit
  s/\b\w{15,}\b/ /g;            # get rid of words longer than 20 chars
  s/\s+/ /g;                    # fix spaces
  my %tf;
  for my $term ( split ) {
    $term = lc( $term );
    next if ( $stophash{$term} > 0 );
    $tf{$term} ++;
  }
  return( \%tf );
} # doc2tfvec()


# compute the square of a value
sub square {
  my $val = shift;
  return( $val * $val );
} # square()


# computes similarity between 2 tf vectors: similarity( \v1, \v2, \df, d )
sub similarity {
  my $v1ref = shift;
  my $v2ref = shift;
  my $dfref = shift;
  my $d = shift;
  my $s = 0;
  my $l1 = 0;
  my $l2 = 0;
  my %dfterms;
  @dfterms{keys(%$v1ref), keys(%$v2ref)} = (); # union of terms in v1 and v2

  for my $k ( keys(%dfterms) ) {
    if( $$dfref{$k} > 0 ) {
      my $idf2 = square( log($d/$$dfref{$k}) );
      $s += $$v1ref{$k} * $$v2ref{$k} * $idf2;
      $l1 += square( $$v1ref{$k} ) * $idf2;
      $l2 += square( $$v2ref{$k} ) * $idf2;
    }
  }

  return( 0 ) if ( $s == 0 );
  return( -1 ) if ( $l1 * $l2 == 0 );
  return( $s / sqrt($l1 * $l2) );
} # similarity()


# given two arrays of strings, return 0 if they are equal, 1 if they differ
sub list_cmp {
  my $ref1 = shift;
  my $ref2 = shift;
  return( 1 ) if ( @$ref1 != @$ref2 ); # not-equal, different length
  for my $i  ( 0..$#$ref1 ) {
    return( 1 ) if ( $$ref1[$i] ne $$ref2[$i] ); # not-equal, elements differ
  }
  return( 0 ); # equal
} # list_eq()


# given a $query and $maxhits, return a list of up to $maxhits URLs from Google
sub scrape_google {
  my $query = shift;
  my $maxhits = shift;
  my $start = 0;
  my @old;
  my @res;

  while ( @res < $maxhits ) {
    #sleep( 1 + rand(5) ) if ( $start > 0 );  # no-hammering delay, 1..5 seconds
    sleep( 5 + rand(25) );  # no-hammering delay, 5..30 seconds
    my $url = "http://www.google.com/search?as_q=$query&num=100&start=$start";
    open( SEARCH, "wget \"$url\" -U \"$wg_ua\" -q -O - |" ) or
                                                die( "Search error: $url $!" );
    local $/; # slurp mode
    my @new = grep( !/google|\/search\?q=cache:/,
                                        (<SEARCH> =~ /<a href=(http[^>]*)>/g) );
    close( SEARCH );
    last if ( @new == 0 || list_cmp(\@old, \@new) == 0 );
    @old = @new;
    $start += @new;
    @res = ( @res, grep(!/\.[rR][tT][fF]$|\.[dD][oO][cC]$|\.[pP][pP][tT]$/, @new) );
  }
  $#res = $maxhits - 1 if ( @res > $maxhits ); # trim down to $maxhits
  return( \@res );
} # scrape_google()


# given a similarity, a threshold, and a method, return boolean decision
# method can be lin-linear, sqr-square, circ-circular, and has to do with
#   the probability distribution model when sim < thr
sub should_disseminate {
  my $sim = shift;
  my $thr = shift;
  my $method = shift;

  die( "parameter out of spec: thr=$thr, sim=$sim\n" )
    if ( $thr < 0 || $thr > 1 || $sim < 0 || $sim > 1 );
  return( 1 ) if ( $sim >= $thr );
  # sim < threshold, scale it up to [0,1] from [0,thr]
  $sim /= $thr;
  # if method is sqr, square sim to make the probability of passing smaller
  $sim *= $sim if ( $method eq "sqr" );
  # if method is circ, place sim on a quarter-of-circle
  if ( $method eq "circ" ) {
    $sim *= $sim;
    $sim = 1 - sqrt( 1 - $sim );
  }
  return( 1 ) if ( $sim >= rand(1) );
  return( 0 );
} # should_disseminate()


# given olddoc and newdoc references, return references to addvec and delvec
sub diffvec {
  my $oldref = shift;
  my $newref = shift;
  my %terms; # union of terms in old and new documents:
  @terms{keys(%$oldref), keys(%$newref)} = ();
  my %addvec;
  my %delvec;
  for my $k ( keys(%terms) ) {
    my $cnt = $$newref{$k} - $$oldref{$k};
    if ( $k ne "" ) {
      $addvec{$k} += $cnt if ( $cnt > 0 );
      $delvec{$k} -= $cnt if ( $cnt < 0 );
    }
  }
  return( \%addvec, \%delvec );
} # diffvec()


# given a profile tf vector, return the terms sorted by decreasing tfidf weight
sub get_top_tfidf {
  my $profvecref = shift;
  my $dfref = shift;
  my $d = shift;
  my %weights; # tf-idf weights

  for my $k ( keys(%$profvecref) ) {
    if( $$dfref{$k} > 0 ) {
      $weights{$k} = $$profvecref{$k} * log( $d / $$dfref{$k} );
    }
  }

  return( sort({$weights{$b} <=> $weights{$a}} keys(%weights)) );

} # get_top_tfidf()


# given conditional probability vectors for pos and neg class, return terms
# sorted by decreasing [p(term|+) / p(term|-)]
sub get_top_bayes {
  my $refpos = shift;
  my $refneg = shift;
  my %weights; # [p(term|+) / p(term|-)] weights

  for my $k ( keys(%$refpos) ) {
    $weights{$k} = $$refpos{$k} - $$refneg{$k};
  }

  return( sort({$weights{$b} <=> $weights{$a}} keys(%weights)) );

} # get_top_bayes()


# given conditional probability vectors for pos and neg class, return terms
# sorted by decreasing odds-ratio
sub get_top_oddsr {
  my $refpos = shift;
  my $refneg = shift;
  my %weights; # [p(term|+) / p(term|-)][(1-p(term|-)) / (1-p(term|+))] weights

  for my $k ( keys(%$refpos) ) {
    $weights{$k} = $$refpos{$k} - $$refneg{$k} +
                   log(1 - exp($$refneg{$k})) - log(1 - exp($$refpos{$k}));
  }

  return( sort({$weights{$b} <=> $weights{$a}} keys(%weights)) );

} # get_top_oddsr()


# download and disseminate hits from an array reference; modify $dishashref
sub dnld_dis {
  my $name = shift;
  my $hitsref = shift;
  my $dir = shift;
  my $today = shift;
  my $meth = shift;		# o (orig) or g (generated)
  my $urlhashref = shift;
  my $qvecref = shift;
  my $rth = shift;
  my $qth = shift;
  # profile info:
  my $useprofile = shift;
  my $pvecref = shift;	# tfidf positive profile vector
  my $pth = shift;	# dissemination threshold
  my $use_neg = shift;
  my $negpvecref = shift; # tfidf negative profile vector
  # bayes profile info:
  my $logpppos = shift;
  my $logppneg = shift;
  my $logtpposref = shift;
  my $logtpnegref = shift;
  # disseminated documents - i/o parameter
  my $dishashref = shift;

  log_entry( "dnld_dis: $name: starting method $meth, docs 0 - $#$hitsref" );

  for my $rank ( 0..$#$hitsref ) {
    my $url = $$hitsref[$rank];
    my $newdocf = "$dir/$today/$meth\_$rank";	# temp name for downloading doc.
    my $filename = "$meth";			# starting point for final name
                                                # q,r,t,b will be appended
    my $dis_flag = 0;				# disseminate ? (deflt.=no)
    my $status;					# document status (New or Chg)
    my $docvref;				# reference to doc vector

    # download document under temporary filename (meth_rank)
    #  wget options used: -U to lie about the user agent
    #                     -t1 to limit to one attempt to download
    #                     -T60 to timeout after 60 seconds if server not resp.
    if( system( "wget \"$url\" -U \"$wg_ua\" -t1 -T60 -q -O $newdocf" ) != 0 ) {
      system( "rm -f $newdocf" ); # skip out if we can't download !
      log_entry( "dnld_dis: $name: wget: skipping $meth $rank $url" );
      next;
    }

    # dissemination checks

    if ( $$urlhashref{$url} ) { # previously seen - check for relevant *changes*

      my $olddocf = "$dir/$$urlhashref{$url}";
      my ($add, $del) = diffvec( doc2tfvec("$olddocf"), doc2tfvec("$newdocf") );
      $docvref = $add;		# use additions as doc. vector for checks
      $status = "Chg";

      if ( keys(%$docvref) > 0 ) { # additions exist, try to disseminate
        my $querysim = similarity( $docvref, $qvecref, \%df, $d );
        if ( should_disseminate($querysim, $qth, $thrmethod) ) {
          $dis_flag = 1;
          $filename .= "q";	# append reason for dissemination to filename
        }
      }

    } else { # new - check for relevance of document *itself*

      $docvref = doc2tfvec( "$newdocf" );	# use doc. itself for checks
      $status = "New";

      # check rank threshold
      my $ranksim = 1 - $rank / ($maxhits - 1); # convert rank to [0,1]
      if ( should_disseminate($ranksim, $rth, $thrmethod) ) {
        $dis_flag = 1;
        $filename .= "r";	# append reason for dissemination to filename
      }

    } # end if ( $$urlhashref{$url} )

    # now check against profile for either new or changed doc.
    if ( $useprofile && (keys(%$docvref) > 0) ) {

      # tf-idf profile check
      my $profsim = similarity( $docvref, $pvecref, \%df, $d );
      if ( should_disseminate($profsim, $pth, $thrmethod) ) {
        $dis_flag = 1;
        $filename .= "t";	# reason for dissemination: tfidf.threshold
      }

      # neg.tfidf and bayes profile check
      if ( $use_neg ) {

        # check against positive-negative tfidf profile
        my $negprofsim = similarity( $docvref, $negpvecref, \%df, $d );
        if ( $profsim > $negprofsim ) {
          $dis_flag = 1;
          $filename .= "u";	# reason for dissemination: tfidf.posneg
        }
        log_entry( "dnld_dis: 2-tfidf: $name: $status: $profsim $negprofsim" );

        # check against bayes profile
        my $vpos = $logpppos;
        my $vneg = $logppneg;
        log_entry( "dnld_dis: bayes-prior: $name: $status: $rank $vpos $vneg $url" );
        for my $k ( keys(%$docvref) ) {
          $vpos += $$logtpposref{$k} * $$docvref{$k};
          $vneg += $$logtpnegref{$k} * $$docvref{$k};
        }
        if ( $vpos > $vneg ) {
          $dis_flag = 1;
          $filename .= "b";	# reason for dissemination: bayes
        }
        log_entry( "dnld_dis: bayes-final: $name: $status: $rank $vpos $vneg $url" );
      }

    } # end if ( $useprofile )

    # end dissemination checks

    # carry out dissemination decision
    if ( $dis_flag ) {
      # disseminate
      if ( $$dishashref{$url} ne "" ) {	# passed before
        my ($oldstat, $oldfilename) = split( ' ', $$dishashref{$url} );
        my $newfilename = "$filename\_$oldfilename";
        system( "rm -f $dir/$today/$oldfilename" );	# remove previous copy
        system( "mv -f $newdocf $dir/$today/$newfilename" ); # rename new copy
        $$dishashref{$url} = "$oldstat $newfilename"; # also rename in %dishash
      } else {
        system( "mv -f $newdocf $dir/$today/$filename\_$rank" );
        $$dishashref{$url} = "$status $filename\_$rank"; # passing first time
      }
      log_entry( "dnld_dis: $name: $status: $filename $rank $url" );
    } else {
      # do not disseminate
      system( "rm -f $newdocf" ); # did not pass, do not disseminate, delete
      log_entry( "dnld_dis: $name: $status: delete $meth $rank $url" );
    } # end if ( $dis_flag )

  } # end for my $rank ( 0..$#$hitsref )

  log_entry( "dnld_dis: $name: done method $meth" );

} # dnld_dis()


# given a query and a reference to a url-unique hash of latest doc. versions
# download and disseminate new hits as required
sub process_query {
  my $query = shift;
  my $trainref = shift;
  my $urlhashref = shift;
  my $today = shift;
  my ($name, $terms, $rth, $qth) = split( /:/, $query );
  my $dir = "$vardir/$name";
  my %queryvec;
  my $pth = $init_threshold;
  my $useprofile = 0;
  # tf-idf profile
  my %profvec;
  my $use_neg = 0;
  # naive bayes profile
  my %negprofvec;
  my $logpppos;
  my $logppneg;
  my %logtppos;
  my %logtpneg;

  my @posdocvecs;	# list of positive sample tf vectors
  my @posdocs;	# list of positive sample file names
  my @negdocvecs;	# list of negative sample tf vectors

  log_entry( "process_query: $name: starting." );

  # build query tf vector
  for my $t ( split(/\+/, $terms) ) { $queryvec{$t} += 1; }

  # add feedback to user profile
  if ( open(PROF, "$dir/$profile") ) {

    $useprofile = 1;

    while ( <PROF> ) {
      chomp;
      my ($date, $dfname) = split;
      # doc file name $dfname consists of literal method name followed by rank.
      my $rank = ($dfname =~ /(\d*)$/)[0];
      my $docvecref = doc2tfvec( "$dir/$_" );
      # update rank threshold:
      my $ranksim = 1 - $rank / ($maxhits - 1); # convert rank to [0,1]
      $rth *= (1 - $adapt_alpha);
      $rth += $adapt_alpha * $ranksim; # equiv. to rth += alpha * (sim - rth)
      # update query threshold:
      my $querysim = similarity( \%queryvec, $docvecref, \%df, $d );
      $qth *= (1 - $adapt_alpha);
      $qth += $adapt_alpha * $querysim; # equiv. to qth += alpha * (sim - qth)
      # update profile vector and threshold:
      if( @posdocvecs > 0 ) {
        my $profsim = similarity( \%profvec, $docvecref, \%df, $d );
        $pth *= (1 - $adapt_alpha);
        $pth += $adapt_alpha * $profsim; # equiv. to pth += alpha * (sim - pth)
        for my $k ( keys(%$docvecref) ) { $profvec{$k} += $$docvecref{$k}; }
      } else {
        for my $k ( keys(%$docvecref) ) { $profvec{$k} = $$docvecref{$k}; }
      }
      # add docvecref to list
      push( @posdocvecs, $docvecref );
      push( @posdocs, "$_" );	# so we can later tell docs are *not* positive !
    }
    close( PROF );
  }

  # add pre_train documents to user profile
  for my $td ( @$trainref ) {

    $useprofile = 1;

    # read in document vector
    my $docvecref = doc2tfvec( "$dir/$td" );
    # update query threshold:
    my $querysim = similarity( \%queryvec, $docvecref, \%df, $d );
    $qth *= (1 - $adapt_alpha);
    $qth += $adapt_alpha * $querysim; # equiv. to qth += alpha * (sim - qth)
    # update profile vector and threshold:
    if( @posdocvecs > 0 ) {
      my $profsim = similarity( \%profvec, $docvecref, \%df, $d );
      $pth *= (1 - $adapt_alpha);
      $pth += $adapt_alpha * $profsim; # equiv. to pth += alpha * (sim - pth)
      for my $k ( keys(%$docvecref) ) { $profvec{$k} += $$docvecref{$k}; }
    } else {
      for my $k ( keys(%$docvecref) ) { $profvec{$k} = $$docvecref{$k}; }
    }
    # add docvecref to list
    push( @posdocvecs, $docvecref );
    # no need to place document *name* on @posdocs array !
  }

  if ( $useprofile ) {
    # should we replace profile threshold ?
    if ( $prefer_minmx_thr && @posdocvecs > 1 ) {
      my $min = 1;
      my $max = 0;
      for my $docvecref ( @posdocvecs ) {
        my $profsim = similarity( \%profvec, $docvecref, \%df, $d );
        $min = $profsim if ( $min < $profsim );
        $max = $profsim if ( $max > $profsim );
      }
      # replace profile threshold with min-max computed one.
      log_entry( "process_query: $name: replaced adapt pth was $pth." );
      $pth = $min + $minmx_alpha * ($max - $min);
      log_entry( "process_query: $name: minmx pth is $pth." );
    } else {
      log_entry( "process_query: $name: adapt pth is $pth." );
    }
    log_entry( "process_query: $name: rth is $rth, qth is $qth." );
    log_entry( "process_query: $name: Done building profile." );

    # compute set diff of all docs for query (values urlhashref) and
    # positive documents (posdocs) to obtain negatives; collect negative
    # docs' tf vector in @negdocvecs array, and add them to the %negprofvec hash
    for my $dfname ( sort values(%$urlhashref) ) {
      my $found = 0;
      for my $k ( @posdocs ) {
        if ( $dfname eq $k ) {
          $found = 1;
          last;
        }
      }
      next if ( $found );
      my $docvecref = doc2tfvec( "$dir/$dfname" );
      # update negative profile vector
      if( @negdocvecs > 0 ) {
        for my $k ( keys(%$docvecref) ) { $negprofvec{$k} += $$docvecref{$k}; }
      } else {
        for my $k ( keys(%$docvecref) ) { $negprofvec{$k} = $$docvecref{$k}; }
      }
      # add docvecref to list
      push( @negdocvecs, $docvecref );
    }

    # train naive Bayes classifier from @posdocvecs and @negdocvecs
    # ( $use_neg; $logpppos; $logppneg; %logtppos; %logtpneg; )
    # also build negative TF vector
    my $nb_dpos = @posdocvecs;
    my $nb_dneg = @negdocvecs;
    if( $nb_dpos > 0 && $nb_dneg > 0 ) {
      $use_neg = 1;
      my %nb_df;	# query-specific DF vector from pos- and negdocvecs
      my %nb_ptf;	# query-specific TF vector for positive class
      my %nb_ntf;	# query-specific TF vector for negative class
      # cycle through @posdocvecs and @negdocvecs, build above three hashes
      for my $docvecref ( @posdocvecs ) {
        for my $k ( keys(%$docvecref) ) {
          $nb_ptf{$k} += $$docvecref{$k};
          $nb_df{$k} ++;
        }
      }
      for my $docvecref ( @negdocvecs ) {
        for my $k ( keys(%$docvecref) ) {
          $nb_ntf{$k} += $$docvecref{$k};
          $nb_df{$k} ++;
        }
      }
      my $nb_nterms = keys( %nb_df );
      $logpppos = log( $nb_dpos / ($nb_dpos + $nb_dneg) );
      $logppneg = log( $nb_dneg / ($nb_dpos + $nb_dneg) );
      my $nb_tcpos= 0; for my $k ( keys(%nb_ptf) ) { $nb_tcpos += $nb_ptf{$k}; }
      my $nb_tcneg= 0; for my $k ( keys(%nb_ntf) ) { $nb_tcneg += $nb_ntf{$k}; }
      for my $k ( keys(%nb_df) ) {
        $logtppos{$k} = log( ($nb_ptf{$k} + 1) / ($nb_tcpos + $nb_nterms) );
        $logtpneg{$k} = log( ($nb_ntf{$k} + 1) / ($nb_tcneg + $nb_nterms) );
      }
    } else {
      log_entry( "process_query: $name: Not using bayes." );
    }

  } else {
    log_entry( "process_query: $name: Not using profile." );
  }

  # create query directories:
  mkdir( $dir, 0700 ) unless ( -d $dir );
  mkdir( "$dir/$today", 0700 ) unless ( -d "$dir/$today" );

  my %dishash;	#disseminated documents are listed in this hash

  # retrieve search engine results for the query, then download and disseminate
  my $hitsref = scrape_google( $terms, $maxhits );
  dnld_dis( $name, $hitsref, $dir, $today, "o", $urlhashref,
            \%queryvec, $rth, $qth, $useprofile, \%profvec, $pth,
            $use_neg, \%negprofvec,
            $logpppos, $logppneg, \%logtppos, \%logtpneg,
            \%dishash );

  # Generate alternative queries
  if ( $useprofile ) {

    #### TF-IDF based query
    # get top tfidf terms from profile
    my @top_tfidf_terms = get_top_tfidf( \%profvec, \%df, $d );
    # generate query from top profile tfidf-weighted terms
    my $tfidf_qgen_terms = join( '+', @top_tfidf_terms[0..3] );
    log_entry( "process_query: $name: TFIDF vector starts with: \"@top_tfidf_terms[0..9]\.\.\"" );
    log_entry( "process_query: $name: TFIDF query is: \"$tfidf_qgen_terms\"" );
    log_entry( "process_query: $name: orig. query was: \"$terms\"" );
    # retrieve search engine results, download and disseminate
    my $tfidf_hitsref = scrape_google( $tfidf_qgen_terms, $maxhits );
    dnld_dis( $name, $tfidf_hitsref, $dir, $today, "g", $urlhashref,
              \%queryvec, $rth, $qth, $useprofile, \%profvec, $pth,
              $use_neg, \%negprofvec,
              $logpppos, $logppneg, \%logtppos, \%logtpneg,
              \%dishash );

    #### Bayes based queries
    if ( $use_neg ) {
      # get top bayes terms from profile
      my @top_bayes_terms = get_top_bayes( \%logtppos, \%logtpneg );
      # generate query from top ranked bayes terms
      my $bayes_qgen_terms = join( '+', @top_bayes_terms[0..3] );
      log_entry("process_query: $name: BAYES query is: \"$bayes_qgen_terms\"" );
      log_entry( "process_query: $name: orig. query was: \"$terms\"" );
      # retrieve search engine results, download and disseminate
      my $bayes_hitsref = scrape_google( $bayes_qgen_terms, $maxhits );
      dnld_dis( $name, $bayes_hitsref, $dir, $today, "h", $urlhashref,
                \%queryvec, $rth, $qth, $useprofile, \%profvec, $pth,
                $use_neg, \%negprofvec,
                $logpppos, $logppneg, \%logtppos, \%logtpneg,
                \%dishash );

      # get top odds-ratio terms from profile
      my @top_oddsr_terms = get_top_oddsr( \%logtppos, \%logtpneg );
      # generate query from top ranked bayes terms
      my $oddsr_qgen_terms = join( '+', @top_oddsr_terms[0..3] );
      log_entry("process_query: $name: ODDSR query is: \"$bayes_qgen_terms\"" );
      log_entry( "process_query: $name: orig. query was: \"$terms\"" );
      # retrieve search engine results, download and disseminate
      my $bayes_hitsref = scrape_google( $oddsr_qgen_terms, $maxhits );
      dnld_dis( $name, $bayes_hitsref, $dir, $today, "i", $urlhashref,
                \%queryvec, $rth, $qth, $useprofile, \%profvec, $pth,
                $use_neg, \%negprofvec,
                $logpppos, $logppneg, \%logtppos, \%logtpneg,
                \%dishash );
    }

  } # end generate alternative queries

  # save disseminated documents to the $disfile
  open( DIS, ">$dir/$today/$disfile" ) or
                                   die( "Can't open $dir/$today/$disfile: $!" );
  # sort by New first, Chg last; also, sort incrementally by rank.
  for my $url ( sort { my ($ast, $ark) = ( $dishash{$a} =~ /^(\w*) .*_(\d*)$/ );
                       my ($bst, $brk) = ( $dishash{$b} =~ /^(\w*) .*_(\d*)$/ );
                       if ( $bst cmp $ast ) { return( $bst cmp $ast ); }
                       return( $ark <=> $brk );
                     }
                keys(%dishash) ) {
    print( DIS "$url $dishash{$url}\n" );
  }
  close( DIS );

  log_entry( "process_query: $name: finished." );

} # process_query()


# given a query, link $dir/$disfile to $dir/$today/$disfile
sub link_result_files {
  my $query = shift;
  my $today = shift;
  my ($name, $terms, $rth, $qth) = split( /:/, $query );
  my $dir = "$vardir/$name";

  system( "rm -f $dir/$disfile; ln -s $today/$disfile $dir/$disfile" );
} # link_result_files();


###############################################################################
# Main program begins here:
###############################################################################

# load config file here
# FIXME : load a real config file !
load_config( "foobar" );

# send mail that we're starting
system( "echo start | mail -s \"QueryTracker Starting...\" $mailto" );

# load stopwords into %stophash
if ( $stopfile ne "" ) {
  open( STOP, $stopfile ) or die( "Can't open $stopfile: $!" );
  while( <STOP> ) {
    chomp;
    $stophash{lc($_)} = 1;
  }
  close( STOP );
}

# acquire lock on config here
system( "lockfile $varlockf" );

# set today and yesterday date strings:
my ($today, $yesterday) = get_date_strings;
$log_date = $today;
log_entry( "starting." );

# load queries:
open( QUERIES, $queryfile ) or die( "File open error: $!" );
my @queries = <QUERIES>;
close( QUERIES );
chomp( @queries );
log_entry( "queries loaded." );

# delete unrated results, update profile
for my $q ( @queries ) { process_yesterday_feedback( $q, $yesterday ); }
log_entry( "profile updated." );

# release lock on config here
system( "rm -f $varlockf" );

# build query-specific and global lists of unique urls documents (%idfdocs hash)
my %query_hashrefs;
my %query_trainrefs;
for my $q ( @queries ) {
  ($query_hashrefs{$q}, $query_trainrefs{$q}) = select_idf_documents( $q );
}
log_entry( "IDF documents selected." );

# build IDF ($d and %df)
#$d = keys( %idfdocs );
$d = 0;
for my $idfdoc ( values(%idfdocs) ) {
  my $docvecref = doc2tfvec( $idfdoc );
  for my $k ( keys(%$docvecref) ) { $df{$k} ++; }
  $d ++;
}
my $nterms = keys( %df );
log_entry( "IDF built: $d docs $nterms terms." );

# process queries ( read only on shared data, no need to lock anything )
my @children;
for my $q ( @queries ) {
  my $pid = fork;

  if ( $pid ) {
    push( @children, $pid );
  } else {
    die( "can't fork: $!" ) unless defined ( $pid );
    process_query( $q, $query_trainrefs{$q}, $query_hashrefs{$q}, $today );
    exit( 0 );
  }
}
for my $child ( @children ) { wait; }
log_entry( "queries processed." );

# acquire lock on config here
system( "lockfile $varlockf" );

for my $q ( @queries ) { link_result_files( $q, $today ); }
log_entry( "finished." );

# release lock on config here
system( "rm -f $varlockf" );

# send mail that we're done
system( "echo done | mail -s \"QueryTracker Finished...\" $mailto" );
