/*
 *  vecGARSP.c
 *  
 *  An 128 bit AltiVec version of the optimized implementation of the GARSP algorithm.
 *	- LOOKUP_FIRSTBLANK() macro replaces the first[] array
 *	- BITOFLIST() macro replaces the bit[] lookup table
 *	- Merged search for s and the shift of LIST and COMP
 *	- Use temporary variables to shift LIST and COMP
 *
 *	- Use two 128Bit register as rulers and do all shifting, ORing and ANDing with it.
 *	- LOOKUP_FIRTSBLANK() and BITOFLIST() is still performed in the scalar integer unit.
 *
 *  Additionally a number of search space reduction techniques were applied (as limit)
 *	- midpoint reduction (eliminate mirror images)
 *	- centerpoint reduction (eliminate center symmetry)
 *	- maxpos reduction (the remaining marks must occupy at least the size of the respective OGR)
 *
 *  Created by Michael Feiri on Sat Jan 26 2002.
 *  Copyright (c) 2002 Michael Feiri. All rights reserved.
 *
 */

#include <stdio.h>
#include "first0.h" /* LOOKUP_FIRSTBLANK */
#include "chooseV3_12_12.h"
//#include "chooseV3_12_13.h" /* CHOOSE */
//#include "achooseV4_12_13.h" /* CHOOSE */
//#include "achooseV5_12_13.h" /* CHOOSE */

#define BITOFLIST(x) (0x80000000>>((x-1)&0x1f)) /*0x80000000 >> ((x-1) % 32)*/
#define MINIMUM(x,y) ((x<y) ? (x) : (y))

#define NUMWORDS 2	/* 2 words (2*128=256) -> we can create rulers up to a length of 255 */
                        /* but thanks to "principle #1" we can safely get rulers up to (256*2)-1=511 */
#define MAXMARKS 19	/* depth of stacks -> we can place up to 18 marks (plus the "0" mark means OGR-19) */

unsigned int reducedVecGARSP(int maxmarks, int maxlength); /* prototype */

unsigned int reducedVecGARSP(int maxmarks, int maxlength) {
int hack = 0;
    int depth;                    /* current mark we are placing corresponds to the D in the PDF */
    int count[MAXMARKS]; /* current ruler length (e.g. LENGTH[])*/
    vector unsigned int list[MAXMARKS][NUMWORDS];   /* list bitmap */
    vector unsigned int dist[MAXMARKS][NUMWORDS];   /* dist bitmap */
    vector unsigned int comp[MAXMARKS][NUMWORDS];   /* comp bitmap */
    vector unsigned int ZEROS = (vector unsigned int)(0);
    vector unsigned int ONES = vec_nor(ZEROS,ZEROS);
    union {
        vector unsigned int V;
        unsigned int U[4];
    } VU;
    int limit;
    unsigned int nodes=0;
    const int halfdepth=(maxmarks/2);
    const int halflength=(!(maxmarks%2) && (maxlength%2)) ? (maxlength/2) : (maxlength/2)-1;
    //const int halflength=(maxlength/2);

    const int OGR[] = { /*  1 */    0,   1,   3,   6,  11,  17,  25,  34,  44,  55,  72,
                        /* 12 */   85, 106, 127, 151, 177, 199, 216, 246, 283, 333, 356,
                        /* 23 */  372, 425, 480, 492, 553, 585, 623, 680, 747, 784 };


    /* Start at the very beginning (one mark set at position zero) */
    depth = 1;
    count[0] = 0;
    count[1] = 0;
    
    list[depth][0]=ZEROS;
    list[depth][1]=ZEROS;

    dist[depth][0]=ZEROS;
    dist[depth][1]=ZEROS;

    comp[depth][0]=ZEROS;
    comp[depth][1]=ZEROS;

    while (1) {

unsigned int dist0 = (VU.V = (dist[depth][0]), VU.U[0]);

/* CHOOSE */

if (CHOOSEMARKS>maxmarks-depth) {
    limit = maxlength-chooseV3[(dist0>>(32-CHOOSEBITS))][(maxmarks-depth)];
} else {
    limit = maxlength-OGR[maxmarks-depth]; // midpoint reduction may supercede this
}

/*
if ((CHOOSEMARKS+2>maxmarks-depth) && (2<=maxmarks-depth)) {
    limit = maxlength-chooseV4[(dist0>>(32-CHOOSEBITS))][(maxmarks-depth)-2];
} else {
    limit = maxlength-OGR[maxmarks-depth]; // midpoint reduction may supercede this
}
*/

/* MIDPOINT */

    if (depth<=halfdepth) {
        int tempLimit;
        

        if (CHOOSEMARKS>halfdepth-depth) {
            tempLimit = halflength-chooseV3[(dist0>>(32-CHOOSEBITS))][(halfdepth-depth)];
        } else {
            tempLimit = halflength-OGR[halfdepth-depth];
        }
/*
        if ((CHOOSEMARKS+2>halfdepth-depth) && (2<=halfdepth-depth)) {
            tempLimit = halflength-chooseV4[(dist0>>(32-CHOOSEBITS))][(halfdepth-depth)-2];
        } else {
            tempLimit = halflength-OGR[halfdepth-depth];
        }
*/

        limit=MINIMUM(limit,tempLimit);
//    if (depth==halfdepth) {
//        limit=MINIMUM(limit,halflength);
    } else if ((depth==(halfdepth+1)) && (maxmarks%2)) {
        int tempLimit;
        tempLimit = maxlength-1-count[halfdepth];
        limit=MINIMUM(limit,tempLimit);
        //printf("l%d r%d\n",count[halfdepth],maxlength-1-count[halfdepth]);
    }


/*{
            if ((count[3]==38)&&(count[2]==28)&&(count[1]==5)) {
            int i;
            for(i=0; i<depth; i++ ) printf("%4d",count[i]);
            printf("\n");
            }
}*/

nodes++;
        /* Find s = the next available mark location for this level */
        /* shift LIST s steps to the right and shift COMP s steps to the left */
    stay:
{
        unsigned int comp0 = (VU.V = (comp[depth][0]), VU.U[0]); /* ????????? */
        if (comp0 < 0xfffffffe) {
            int s = LOOKUP_FIRSTBLANK( comp0 );
            if ((count[depth] += s) > limit)   goto up; /* no spaces left */
            /*EXTEND_MARK(lev, s); *//* shift by s (LIST to the right, COMP to the left) */
            /* s is guaranteed to be < 32 !!! */
            {
                vector unsigned int Vs;
                vector unsigned int Vm;
                vector unsigned int Vss;
                VU.U[3] = s;
                Vs = vec_splat(VU.V,3);
                Vm = vec_sl(ONES,Vs);
                Vss = vec_sub(ZEROS,Vs);
                comp[depth][0] = vec_rl(comp[depth][0],Vs);
                comp[depth][1] = vec_rl(comp[depth][1],Vs);
                comp[depth][0] = vec_sel(vec_sld(comp[depth][0],comp[depth][1],4),comp[depth][0],Vm);
                comp[depth][1] = vec_sel(vec_sld(comp[depth][1],ZEROS,4),comp[depth][1],Vm);
                list[depth][1] = vec_sel(vec_sld(list[depth][0],list[depth][1],12),list[depth][1],Vm);
                list[depth][0] = vec_sel(vec_sld(ZEROS,list[depth][0],12),list[depth][0],Vm);
                list[depth][1] = vec_rl(list[depth][1],Vss);
                list[depth][0] = vec_rl(list[depth][0],Vss);
               }
        } else { /* s>=32 */
            if ((count[depth] += 32) > limit)  goto up; /* no spaces left */
            /*EXTEND_MARK_BY_WORD(lev); *//* shift by 32 (LIST to the right, COMP to the left) */
            {
                comp[depth][0] = vec_sld(comp[depth][0], comp[depth][1], 4);
                comp[depth][1] = vec_sld(comp[depth][1], ZEROS, 4);
                list[depth][1] = vec_sld(list[depth][0], list[depth][1], 12);
                list[depth][0] = vec_sld(ZEROS, list[depth][0], 12);
            }
            if (comp0 == 0xffffffff)    goto stay;
        }
}
        /* did we find a ruler with the desired number of marks? */
        if (depth == (maxmarks)) {
            int i;
        hack++;
            printf("FOUND A RULER!!!(%d)\n",hack);
            for(i=0; i<=maxmarks; i++ ) printf("%4d",count[i]);
            printf("\n");
            continue; /* go back to the start of our while(1) loop*/
        } else {
        /* if we didnt distribute all marks we need to go deeper...*/
        
            /* prepare LIST[D+1]=LIST+(new_mark)*/
            int d = count[depth]-count[depth-1];
            
            //VU.V = ZEROS;
            union {
                vector unsigned int V;
                unsigned int U[4];
            } VU;
            VU.U[3] = 0;
            VU.V = vec_splat(VU.V,3);
            
            if (d <= 128) {
                if( d <= 32 ) {
                    VU.U[0]=BITOFLIST( d );
                } else if( d <= 64 ) { 
                    VU.U[1]=BITOFLIST( d );
                } else if( d <= 96 ) {
                    VU.U[2]=BITOFLIST( d );
                } else {  /* d <= 128 */
                    VU.U[3]=BITOFLIST( d );
                }
                list[depth+1][0] = vec_or(list[depth][0],VU.V); /* vec_add should work too, faster?? */
                list[depth+1][1] = list[depth][1];
            } else { /* d<=256 */
                if( d <= 160 ) {
                    VU.U[0]=BITOFLIST( d );
                } else if( d <= 192 ) { 
                    VU.U[1]=BITOFLIST( d );
                } else if( d <= 224 ) {
                    VU.U[2]=BITOFLIST( d );
                } else {  /* d <= 256 */
                    VU.U[3]=BITOFLIST( d );
                }
                list[depth+1][0] = list[depth][0];
                list[depth+1][1] = vec_or(list[depth][1],VU.V); /* vec_add should work too, faster?? */
            }
            
            /* prepare the new DIST[D+1]=DIST|LIST[D+1]*/
            dist[depth+1][0] = vec_or(list[depth+1][0],dist[depth][0]);
            dist[depth+1][1] = vec_or(list[depth+1][1],dist[depth][1]);

            /* prepare COMP[D+1]=COMP|DIST[D+1]*/
            comp[depth+1][0] = vec_or(dist[depth+1][0],comp[depth][0]);
            comp[depth+1][1] = vec_or(dist[depth+1][1],comp[depth][1]);
            
            count[depth+1] = count[depth]; /* pass on our current length*/
            depth++;			   /* one level deeper*/

            continue; /* go back to the start of our while(1) loop*/
        }
        
    up:
        if (--depth == 0) return nodes;/* EXIT if we reached end of search space */
    }

    return nodes;
}

#ifndef AS_MODULE

#include <sys/time.h>
#include <sys/resource.h>

double dtime()
{
    struct rusage rusage;
    double q;

    getrusage(RUSAGE_SELF,&rusage);

    q = (double)(rusage.ru_utime.tv_sec);
    q = q + (double)(rusage.ru_utime.tv_usec) * 1.0e-06;
	
    return q;
}

int main (int argc, const char * argv[])
{
    double starttime, benchtime;
    unsigned int nodes;
    
    printf("\nreducedVecGARSP started...\n");
    
    starttime = dtime();
    nodes = reducedVecGARSP(13,127);
    benchtime = dtime() - starttime;
    
    printf("Nodes per second = %.3f ( %u nodes/ %.3f seconds)\n",nodes/benchtime,nodes,benchtime);    
    
    return 0;
}

#endif
