// Lemur OLAP library (c) 2003 National Research Council of Canada by Daniel Lemire, and Owen Kaser
 /**
 *  This program is free software; you can
 *  redistribute it and/or modify it under the terms of the GNU General Public
 *  License as published by the Free Software Foundation (version 2). This
 *  program is distributed in the hope that it will be useful, but WITHOUT ANY
 *  WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 *  FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
 *  details. You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software Foundation,
 *  Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 */
#ifndef PEARSONNORMALIZATION_H
#define PEARSONNORMALIZATION_H

#include "normalizations.h"
#include "../lemurcore/common.h"

/*
 *  Warning: This appears to work *very* poorly in practice.
 *
 *  The motivation behind PearsonNormalization is 
 *  that FrequencySort is somewhat global. Indeed, it was designed
 *	by Owen with the one-chunk storage method in mind. That's
 *	a fundamental flaw and it limits the efficiency
 *	of FrequencySort.
 *
 *
 *
 *  Here's a typical problem we don't seem to know how to handle:
 *  
 *  {1,0,1,0}
 *  {0,1,1,1}  =   A
 *  {1,1,1,0}
 *  {0,1,0,1}
 *
 *  which can be normalized to
 *  
 *  {1,1,0,0}
 *  {0,1,1,1}
 *  {1,1,1,0}
 *	{0,0,1,1}
 *	
 *  by switching columns 2 and 3 (out of columns 1,2,3,4) and then to
 *   
 *  {1,1,0,0}
 *  {1,1,1,0}
 *  {0,1,1,1}
 *  {0,0,1,1}
 *
 *  by switching rows 2 and 3. The result appears to me to be optimal in a 2x2
 *  chunk kind of way (two full chunks). However, none of our current schemes
 *  can find this! I think that's because we still think very much in
 *  terms of "one-chunk" in almost everything we do. I think this is a
 *  fundamentally flawed way of looking at things in our context.
 *
 * 
 */
template <class _DT, class _LDT>
class PearsonNormalization : public Normalization<_DT,_LDT> {
public:

    PearsonNormalization(){}

    virtual PearsonNormalization* clone() const { return new PearsonNormalization(*this);}

    virtual ~PearsonNormalization() {}

    virtual const string getTextName(void) const {
        return	 "pearson normalization" ;
    }

    virtual norm_type computeNormalFrom( DataCube<_DT,_LDT>& DC,
            const norm_type & initial);
private:

    enum { verbose = false, paranoid = false};

    virtual float similarity( DataCube<_DT,_LDT>& DC,
                              const int dimension, const int index1, const int index2);
};

//
// The implementation follows
//
//



// This is really the trivial part. You'd want to optimize this or play
// with it.
template <class _DT, class _LDT>
norm_type PearsonNormalization<_DT,_LDT>::computeNormalFrom( DataCube<_DT,_LDT>& DC,
        const norm_type & initial) {

    vector<int> shape = DC.getShape();
    assert(shape.size() == initial.size());
    norm_type answer(shape.size());
    for(uint dim = 0; dim < shape.size(); ++dim) {// we proceed dimension per dimension
        //cout << " doing dimension " << dim << " out of " << shape.size() << endl;// computations are long...
        deque<int> currentnormalization(initial[dim].begin(),  initial[dim].end());// create a copy
        vector<int> & newnormalization =  answer[dim];// a reference to the answer
        newnormalization = vector<int>(shape[dim]);
        // the first one is never changed by this algorithm
        //
        // Let's see an example, suppose currentnormalization is
        //				{0->2},{1->1},{2->0}
        // We will start by using 2 as a starting point {0->2}, then
        // we will compare 2 with 1 and 0 and use the best...
        //
        int index = 0;
        //deque<pair<uint64,int> > histo =  sortedFrequencyHistogram(DC, dimension);
        //		int largest = max_element(histo)->second;
        {
            deque<int>::iterator iterbegin = currentnormalization.begin();
            newnormalization[index] = *iterbegin;
            currentnormalization.erase(iterbegin);
        }
        // that was the first step, but I want to remove it from initial
        // now, we enter the real thing
        while(currentnormalization.size() > 0) {
            // seek the next best
            deque<int>::iterator it = currentnormalization.begin();
            float max = similarity(DC, dim, newnormalization[index], *it);
            if(verbose) cout << " comparing index = " <<  newnormalization[index] << " with "
                << *it << " = " << max << endl;
            deque<int>::iterator best = it;
            deque<int>::iterator temp = ++it;
            while(temp !=	currentnormalization.end()) {
                float current = similarity(DC, dim,newnormalization[index],*temp);
                if(verbose) cout << " comparing index = " <<  newnormalization[index] << " with "
                    << *it << " = " <<   current << endl;
                if(current > max) {
                    max = current;
                    best = temp;
                }
                ++temp;
            }
            // the best choice is at it
            newnormalization[++index] = *best;
            if(paranoid) assert( similarity(DC, dim,newnormalization[index - 1],newnormalization[index]) == max ) ;
            currentnormalization.erase(best);
        }
        assert(newnormalization.size() == (uint) shape[dim]);// sanity
        assert(PermutationUtil::isPermutation(newnormalization));
    }
    return answer;
}


// That's where the "hard work" is done.
template <class _DT, class _LDT>
float PearsonNormalization<_DT,_LDT>::similarity( DataCube<_DT,_LDT>& DC, const int dimension,
        const int index1, const int index2) {
    assert(index1 >= 0);
    assert(index2 >= 0);
    assert(dimension >= 0);
    assert(index1 != index2);
    vector<int> shape = DC.getShape();
    assert((uint) dimension < shape.size());
    assert( index1 < shape[dimension]);
    assert( index2 < shape[dimension]);
    vector<int> start(shape.size(),0);
    start[dimension] = index1;
    const int offset = index2 - index1;
    vector<int> bounds = shape;
    bounds[dimension] = index1 + 1;
    _LDT ScalarProduct = 0, Energy1 = 0, Energy2 = 0, Count = 0;
    vector<int> indices (start);
    vector<int> offsetindices;
    do {// everything inside this loop should be lightning fast if possible
        ++Count;
        offsetindices = indices;// that's a bit on the slow side
        offsetindices[dimension] += offset;
        const bool allocated1 = (DC.get(indices) != 0);
        const bool allocated2 = (DC.get(offsetindices) != 0);
        if(allocated1) {
            ++Energy1;
            if(allocated2) {
                ++Energy2;
                ++ScalarProduct;
            }
        } else if (allocated2) ++Energy2;
    } while(MathUtil::increment( indices, start, bounds) );
    // next we compute the pearson correlation as such
    if(verbose)
        cout << " ScalarProduct = " << ScalarProduct <<
        " Energy1 = "<< Energy1 << " Energy2 = " << Energy2 << endl;
    if((Energy1 == 0) || (Energy2 == 0)) return 0.0f;
    const float average1 = Energy1 / Count;
    const float average2 = Energy2 / Count;
    const float norm1 = sqrt((float) Energy1 - 2 * average1 * Energy1 + average1 * average1);
    const float norm2 = sqrt((float) Energy2 - 2 * average2 * Energy2 + average2 * average2);
    const float normproduct = norm1 * norm2;
    if(normproduct == 0.0f) return 0.0f;
    const float product = ScalarProduct - average1 * Energy2 - average2 * Energy1 + average1 * average2;
    return product / normproduct ;
}
//(x1 - avg1) (x2 - avg2) = x1 * x2 - avg1 *x2 - avg2 * x1 + avg1 * avg2
// (x1 - avg1) ( x1 - avg1) = x1 * x1 - 2* avg1 * x1  + avg1 **2
// That's it folks. It was easy, hey?

#endif



