/**********************************************/
// fastBuildnaiveSearch:
//   heap.cpp:  The heap class implements the position heap
/**********************************************/
#include <iostream>
#include <new>
#include <vector>
#include <string.h>
#include <stdlib.h>
#include "heap.h"
#include "downNode.h"
#include "generic.h"
#include "mylist.h"
using std::cout;
using std::cin;
using std::endl;


/*********************************************************************
This file demonstrates how to build the position heap for a text in O(n) time,
where n is the length of the text.  The key elements of the algorithm are
in the 'build' procedure.  

Once again, we can "name" a node by the sequence of edge labels from
the root to the node.  This is a string of letters from the text's alphabet.
See the naive implementation of the build operation for a definition
of the (primal) position heap.  The nodes of the dual position heap
are the same as the nodes of the primal, so each node has a parent in the 
primal heap and a parent in the dual.  Here's the definition of the
dual heap:  it is the tree where the name of each node is the reverse
of its name in the primal position heap.

As with the naive construction algorithm, we build the heap from right
to left in the text.  Let arrayIndex be the current position to be
added.  As with the naive construction algorithm, when
you add a new node for the current position, it must be a child of
the longest prefix of text[arrayIndex..0] that is already a node x
of the heap.   The only thing that differs is how we find x.

The naive construction algorithm, which is also implemented on this
site, indexes into the heap on text[arrayIndex..0], starting at the root,
until it "falls off the tree," that is, until it finds a node with no 
child on the next character.   The last node reached is x.

The algorithm below climbs from the most recently added node in the
heap until it reaches a node that has a child in the dual on character
c = text[arrayIndex].  That child is x.  The new node is added as a child
of x.  

The reason this is linear is that the new node is two levels deeper than
the highest node reached (it need not be a descendant of that node), and
on the next iteration you start climbing from the new node.  On each
iteration, you descend two levels from the highest node reached.  The
total amount you climb over all iterations is bounded by the total amount
you descend, which is two levels times n = 2n = O(n).

Notice that you are always climbing in the position heap and looking
downward in the dual heap.  Therefore, during construction, you can
store the position heap as an upwardly-directed tree (one parent pointer
for each node) and the dual heap as a downwardly-directed tree
(a list of children for each node).  Once constructed, the position
heap must be a downwardly-directed tree to be used for searching.
So once construction is complete, you can discard the dual heap
and convert the primal position heap into a downwardly directed tree.

The naive algorithm takes two integers per character of text.  Because of the
need for parent pointers, the algorithm in this file takes three
integers per character of text during construction, and two integers
per character thereafter.

The naive construction takes O(n^2) time in the worst case, but this
is overly pessimistic in most practical applications.  Some large
input files are provided on the site so that you can compare the
difference in running times of the two algorithms on them.
***************************************************************/

/****************************************/
// position heap constructor.  Builds the position heap
//  for the text pointed to by 'str'.
//
//  str:  NUL-terminated text to index.  It is only read; the heap works on
//        its own reversed copy.  A null pointer is treated as an empty text.
//
//  If any of the three arrays cannot be allocated, a message is written to
//  cout and the process exits with status 1.
/****************************************/
heap::heap(char *str)
{
    const char *source = str ? str : "";   // tolerate a null text pointer
    textLength = strlen (source);          // length of text

    // Plain 'new' reports failure by throwing, so a null test after it can
    // never fire.  The nothrow form returns null instead, which is what the
    // check below relies on.

    // upwardly-directed rooted tree for holding the (primal) position
    // heap during construction ...
    parent = new (std::nothrow) int [textLength];

    // downwardly-directed rooted tree for holding the dual heap during
    //   construction, and also the primal heap when it's been constructed
    //   and is ready for use ...
    downArray = new (std::nothrow) downNode[textLength];

    // Private version of text.  If you want to keep storage cost down to two
    //  integers per character of text, you should use the text pointed to
    //  by *str, rather than keeping a private copy of the text.
    //  Note that this copy is NOT NUL-terminated.
    text = new (std::nothrow) char[textLength];

    if (! parent || ! downArray || ! text)
         {cout << "Memory allocation failure in heap constructor\n"; exit(1);}

    // Reverse the indexing order to be from right to left in the private
    // copy:  the leftmost character of 'str' lands at text[textLength-1] and
    // the rightmost at text[0].  Indexing (rather than walking a pointer
    // down from text + textLength - 1) stays inside the array even when the
    // text is empty.
    for (int i = 0; i < textLength; i++)
        text[textLength - 1 - i] = source[i];

    build();                       // build the position heap for the string
}

// position heap destructor:  releases the three arrays that the constructor
// obtained with new[] (the private text copy, the parent array and the
// downward tree) ...
heap::~heap()
{
    delete [] text;
    delete [] parent;
    delete [] downArray;
}

/*******************************************/
// build:  Build the position heap.  Assume text has been reversed in its 
// array so that the indices are in ascending order from right to left.
//
// During construction 'parent' holds the primal heap (one parent index per
// node) and 'downArray' holds the dual heap (child/sibling lists).  When the
// loop is done, downArray is rebuilt to hold the primal heap as a
// downwardly-directed tree and the parent array is released.
//
// A progress line is written to cout every 100000 positions.
/*******************************************/
void heap::build ()
{
    // Most recently added node; the next climb starts here.  It is always
    // overwritten before it is read, because the first position finds the
    // root without children and takes the "no child" branch below.
    int pathNode = root;

    for (int arrayIndex = 1; arrayIndex < textLength; arrayIndex++)
    {
        if ((arrayIndex % 100000) == 0) 
               cout << "Text position: " << arrayIndex << '\n';

        const char c = text[arrayIndex];   // letter that begins the new node's name

        if (childOnLetter(root, 0, c) == NOCHILD)
        {
            // No node's name begins with c yet:  the new node hangs off the
            // root in both the primal and the dual heap.
            parent[arrayIndex] = root;
            insertChild(arrayIndex, root);
        }
        else
        {
            // Starting at the most recently added node, climb in the primal 
            // position heap until you find a child on the new letter c in 
            // the dual heap.  That child is the longest prefix of 
            // text[arrayIndex..0] that is already a node of the primal 
            // heap.  The new node must therefore be added as a child of 
            // it in the primal heap.  In the dual heap, the new node must 
            // be added to the highest node you saw that doesn't have a 
            // child on letter 'c'.
            //
            // The climb always stops at the root at the latest, because this
            // branch is only taken when the root has a dual child on c.
            int prevPathNode;   // child of pathNode on way up
            int child;          // dual child of pathNode on c, if any
            do
            {
                prevPathNode = pathNode;
                pathNode = parent[pathNode];
                child = childOnLetter(pathNode, 0, c);
            } while (child == NOCHILD);  
           
            // add new node to primal heap
            parent[arrayIndex] = child;

            // add new node to dual heap
            insertChild(arrayIndex, prevPathNode);
        }

        // record new node in preparation for next iteration
        pathNode = arrayIndex;
    }

    // Turn heap from an upwardly directed tree in parent array to a downwardly
    //  directed tree in downArray, discarding the dual heap ...
    //  (plain 'new' throws std::bad_alloc if this allocation fails)
    delete [] downArray;        
    downArray = new downNode[textLength]; 
    for (int arrayIndex = 1; arrayIndex < textLength; arrayIndex++)
        insertChild(arrayIndex, parent[arrayIndex]);

    // The parent pointers are only needed while building.  Releasing them
    // here brings storage back to two integers per character of text; the
    // pointer is nulled so that the destructor's delete[] is a no-op.
    delete [] parent;
    parent = NULL;
}

/**************************************/
// insertChild:  link 'child' in at the front of the child list of 'parent'
// in downArray.  The previous first child of 'parent' (possibly none)
// becomes the sibling of 'child'.
/**************************************/
void heap::insertChild(int child, int parent)
{
    // Note:  the parameter 'parent' hides the member array of the same name
    // inside this function; only downArray is touched here.
    const int formerFirst = downArray[parent].getChild();
    downArray[child].setSibling(formerFirst);
    downArray[parent].setChild(child);
}

/*************************************/
//  search:  Find all occurrences of 'pattern' in the text from which the
//  position heap was constructed.
//
//  pattern:        the characters to look for.  The buffer is reversed in
//                  place while searching and restored before returning, so
//                  it must be writable.
//  patternLength:  number of characters in 'pattern'.
//
//  Returns a newly allocated list of the matching positions (numbered
//  right to left in the text).  The caller must delete the list when it is
//  no longer used to avoid a memory leak.  A null or empty pattern, or an
//  empty text, yields an empty list.
/*************************************/
mylist *heap::search(char *pattern, int patternLength)
{
   // With nothing to index on (or nothing to index into), the indexing code
   // would read pattern[-1] or downArray[0]; report "no occurrences" instead.
   if (! pattern || patternLength <= 0 || textLength <= 0)
   {
      mylist *none = new mylist();
      none->compact();   // same final step as the normal path below
      return none;
   }

   int lastNode;  // NOCHILD if we fall off the tree while indexing on pattern.
                  // Otherwise, it's the last node of indexing path.

   //  observe convention of making indices descend from left to right
   reverse (pattern, patternLength); 

   // Find all positions on indexing path that are occurrences of pattern ...
   mylist *Occurrences = pathOccurrences(pattern, patternLength, lastNode);

   // If pattern was exhausted during indexing, append positions in subtree
   //  rooted at lastNode ...
   if (lastNode != NOCHILD) appendSubtreeOccurrences(lastNode, Occurrences);

   // un-reverse the user's pattern string to leave it in its original state
   reverse (pattern, patternLength); 
   
   // compact() is defined by mylist; presumably it packs the collected
   // positions into contiguous storage -- confirm against mylist.h
   Occurrences->compact();
   return Occurrences;
}


/****************************
// childOnLetter:  Scan the child list of 'node' in downArray for the child
//  whose edge is labelled with character c, and return its index.  Returns
//  NOCHILD when no child carries that label.
//
//  'nodeDepth' is the depth of 'node'.  The label of the edge to a child is
//  read from the text at index (child - nodeDepth).
******************************/
int heap::childOnLetter(int node, int nodeDepth, char c)
{
   int candidate = downArray[node].getChild();
   for ( ; candidate != NOCHILD; candidate = downArray[candidate].getSibling())
   {
      if (text[candidate - nodeDepth] == c)
         break;      // found the child on letter c
   }
   return candidate; // NOCHILD if the list was exhausted
}

/*****************************
//  Index into heap on pattern, returning a list of all positions corresponding
//  to nodes on the path where the pattern occurs.
//
//  pattern:        the pattern, already reversed by the caller, so that its
//                  leftmost character is pattern[patternLength - 1].
//  patternLength:  number of characters in 'pattern'.
//  lastNode:       output.  Set to the node reached when the whole pattern
//                  was consumed, or to NOCHILD if indexing fell off the tree
//                  first (or if there was nothing to index).
//
//  The method allocates the list of positions; this must be deleted
//  when it is no longer needed.  On allocation failure a message is written
//  to cout and the process exits with status 1.
******************************/
mylist *heap::pathOccurrences(char *pattern, int patternLength, int &lastNode)
{
    // Indexing path has at most 'patternLength' positions on it.
    // The nothrow form is used so that the null test below can actually fire.
    mylist *Occurrences = new (std::nothrow) mylist(); 
    if (! Occurrences) {cout << "Memory allocation failure in pathOccurrences\n"; exit(1);}

    // An empty pattern would make the loop below read pattern[-1], and an
    // empty text has no node 0 to start from:  report no occurrences.
    lastNode = NOCHILD;
    if (! pattern || patternLength <= 0 || textLength <= 0)
        return Occurrences;

    int child = 0;   // next node on the indexing path; the walk starts at node 0
    int depth = 0;   // depth of the node being examined
    do
    {
        const int pathNode = child;

        // the first depth characters of the pattern are known to match at
        // this node's position, because these are spelled out by the edges to
        // the node.  Check the last patternLength - depth characters to see 
        // if they also match.  If so, report the position as a match.
        if (isOccurrence(pattern, 
                         patternLength - depth, 
                         pathNode - patternLength + 1))
            Occurrences->add(pathNode);

        // Step down on the next pattern character, reading the reversed
        // pattern from its high end (the left end of the original pattern).
        child = childOnLetter(pathNode, depth, pattern[patternLength - 1 - depth]);
        ++depth;
    } while (child != NOCHILD && depth < patternLength);

    lastNode = child;
    return Occurrences;
}


/**************************
 *  isOccurrence:  Tell whether reverse of a pattern string matches at 
 *  *ending position* 'pos', given a pointer to the rightmost character 
 *  (the one with index 0) in the pattern string.  That is, tell whether
 *  the pattern and the text match when the pattern is lined up with its
 *  rightmost position at 'pos'.   
*****************************/
bool heap::isOccurrence(char *pattern, int patternLength, int pos)
{
   if (pos < 0) return false;
   char *textptr = text + pos;
   
   for (int i = 0; i < patternLength; i++)
      if (*pattern++ != *textptr++) 
          return false;
   return true;
}

/****************************
 *  If you didn't fall off the tree while indexing in on the pattern
 *  string, then all positions corresponding to descendants of the
 *  last node on the indexing path are also occurrences of the pattern.
 *  Append them to the list of places where the pattern string occurs.
 *
 *  'node' and all of its descendants are appended to 'Occurrences' in
 *  preorder (a node first, then its children in child-list order).
 *
 *  The traversal is iterative.  The heap can be as tall as the text is long
 *  (a run of one repeated character produces a single path), so recursing
 *  once per level could exhaust the call stack on large texts.
*****************************/
void heap::appendSubtreeOccurrences(int node, mylist *Occurrences)
{
    Occurrences->add(node);   // append root of subtree

    // Siblings that still have to be visited once the subtree currently
    // being walked is finished.  The most recently deferred one is taken
    // first, which reproduces the preorder.
    std::vector<int> deferred;

    int current = downArray[node].getChild();
    for (;;)
    {
        // Walk down along first children, appending each node and noting
        // its next sibling for later.
        while (current != NOCHILD)
        {
            Occurrences->add(current);
            const int sibling = downArray[current].getSibling();
            if (sibling != NOCHILD)
                deferred.push_back(sibling);
            current = downArray[current].getChild();
        }

        if (deferred.empty())
            break;            // nothing left to visit

        current = deferred.back();
        deferred.pop_back();
    }
}

/******************************
Display the shape of the heap tree using indented preorder ...
*******************************/
void heap::preorderPrint()
{
    if (downArray)
       preorderPrint(0,0);
}

// preorderPrint (index, depth):  print one line for node 'index', indented by
// 'depth' spaces and listing each child as (edge letter,child index), then
// print the subtrees of the children at depth + 1.  Does nothing when
// 'index' is NOCHILD.  Recurses once per level of the tree.
void heap::preorderPrint (int index, int depth)
{
    if (index == NOCHILD)
        return;

    // indentation:  one space per level
    int pad = depth;
    while (pad-- > 0)
        cout << ' ';

    cout << "Node " << index << "  Depth " << depth << "  Children: ";

    // first pass over the child list:  describe the outgoing edges
    int kid = downArray[index].getChild();
    while (kid != NOCHILD)
    {
        cout << '(' << text[kid-depth] << ',' << kid << ')';
        kid = downArray[kid].getSibling();
    }
    cout << '\n';

    // second pass:  print each child's subtree one level deeper
    kid = downArray[index].getChild();
    while (kid != NOCHILD)
    {
        preorderPrint(kid, depth+1);
        kid = downArray[kid].getSibling();
    }
}
int heap::getTextLength()
{
    return textLength;
}

