/*
 *  vecGARSP.c
 *  
 *  An 128 bit AltiVec version of the optimized implementation of the GARSP algorithm.
 *	- LOOKUP_FIRSTBLANK() macro replaces the first[] array
 *	- BITOFLIST() macro replaces the bit[] lookup table
 *	- Merged search for s and the shift of LIST and COMP
 *	- Use temporary variables to shift LIST and COMP
 *
 *	- Use two 128Bit register as rulers and do all shifting, ORing and ANDing with it.
 *	- LOOKUP_FIRTSBLANK() and BITOFLIST() is still performed in the scalar integer unit.
 *
 *  Created by Michael Feiri on Mon Jan 07 2002.
 *  Copyright (c) 2002 Michael Feiri. All rights reserved.
 *
 */

#include <stdio.h>
#include "first0.h" /* LOOKUP_FIRSTBLANK */

#define NUMWORDS 2	/* 2 words (2*128=256) -> we can create rulers up to a length of 255 */
                        /* but thanks to "principle #1" we can safely get rulers up to (256*2)-1=511 */
#define MAXMARKS 19	/* depth of stacks -> we can place up to 18 marks (plus the "0" mark means OGR-19) */

#define BITOFLIST(x) (0x80000000>>((x-1)&0x1f)) /*0x80000000 >> ((x-1) % 32)*/

unsigned int vecGARSP(int maxmarks, int maxlength); /* prototype */

unsigned int vecGARSP(int maxmarks, int maxlength) {

    int depth;                    /* current mark we are placing corresponds to the D in the PDF */
    unsigned int count[MAXMARKS]; /* current ruler length (e.g. LENGTH[])*/
    vector unsigned int list[MAXMARKS][NUMWORDS];   /* list bitmap */
    vector unsigned int dist[MAXMARKS][NUMWORDS];   /* dist bitmap */
    vector unsigned int comp[MAXMARKS][NUMWORDS];   /* comp bitmap */
    vector unsigned int ZEROS = (vector unsigned int)(0);
    vector unsigned int ONES = vec_nor(ZEROS,ZEROS);
    union {
        vector unsigned int V;
        unsigned int U[4];
    } VU;
    unsigned int nodes=0;

    /* Start at the very beginning (one mark set at position zero) */
    depth = 1;
    count[0] = 0;
    count[1] = 0;
    
    list[depth][0]=ZEROS;
    list[depth][1]=ZEROS;

    dist[depth][0]=ZEROS;
    dist[depth][1]=ZEROS;

    comp[depth][0]=ZEROS;
    comp[depth][1]=ZEROS;

    /* search through the search space **** */
    while (1) {
nodes++;

        /* Find s = the next available mark location for this level */
        /* shift LIST s steps to the right and shift COMP s steps to the left */
    stay:
{
        unsigned int comp0 = (VU.V = (comp[depth][0]), VU.U[0]); /* ????????? */
        if (comp0 < 0xfffffffe) {
            int s = LOOKUP_FIRSTBLANK( comp0 );
            if ((count[depth] += s) > maxlength)   goto up; /* no spaces left */
            /*EXTEND_MARK(lev, s); *//* shift by s (LIST to the right, COMP to the left) */
            /* s is guaranteed to be < 32 !!! */
            {
                vector unsigned int Vs;
                vector unsigned int Vm;
                vector unsigned int Vss;
                VU.U[3] = s;
                Vs = vec_splat(VU.V,3);
                Vm = vec_sl(ONES,Vs);
                Vss = vec_sub(ZEROS,Vs);
                comp[depth][0] = vec_rl(comp[depth][0],Vs);
                comp[depth][1] = vec_rl(comp[depth][1],Vs);
                comp[depth][0] = vec_sel(vec_sld(comp[depth][0],comp[depth][1],4),comp[depth][0],Vm);
                comp[depth][1] = vec_sel(vec_sld(comp[depth][1],ZEROS,4),comp[depth][1],Vm);
                list[depth][1] = vec_sel(vec_sld(list[depth][0],list[depth][1],12),list[depth][1],Vm);
                list[depth][0] = vec_sel(vec_sld(ZEROS,list[depth][0],12),list[depth][0],Vm);
                list[depth][1] = vec_rl(list[depth][1],Vss);
                list[depth][0] = vec_rl(list[depth][0],Vss);
               }
        } else { /* s>=32 */
            if ((count[depth] += 32) > maxlength)  goto up; /* no spaces left */
            /*EXTEND_MARK_BY_WORD(lev); *//* shift by 32 (LIST to the right, COMP to the left) */
            {
                comp[depth][0] = vec_sld(comp[depth][0], comp[depth][1], 4);
                comp[depth][1] = vec_sld(comp[depth][1], ZEROS, 4);
                list[depth][1] = vec_sld(list[depth][0], list[depth][1], 12);
                list[depth][0] = vec_sld(ZEROS, list[depth][0], 12);
            }
            if (comp0 == 0xffffffff)    goto stay;
        }
}
        /* did we find a ruler with the desired number of marks? */
        if (depth == (maxmarks)) {
            int i;
            printf("FOUND A RULER!!!\n");
            for(i=0; i<=maxmarks; i++ ) printf("%4d",count[i]);
            printf("\n");
            continue; /* go back to the start of our while(1) loop*/
        } else {
        /* if we didnt distribute all marks we need to go deeper...*/
        
            /* prepare LIST[D+1]=LIST+(new_mark)*/
            int d = count[depth]-count[depth-1];
            
            //VU.V = ZEROS;
            union {
                vector unsigned int V;
                unsigned int U[4];
            } VU;
            VU.U[3] = 0;
            VU.V = vec_splat(VU.V,3);
            
            if (d <= 128) {
                if( d <= 32 ) {
                    VU.U[0]=BITOFLIST( d );
                } else if( d <= 64 ) { 
                    VU.U[1]=BITOFLIST( d );
                } else if( d <= 96 ) {
                    VU.U[2]=BITOFLIST( d );
                } else {  /* d <= 128 */
                    VU.U[3]=BITOFLIST( d );
                }
                list[depth+1][0] = vec_or(list[depth][0],VU.V); /* vec_add should work too, faster?? */
                list[depth+1][1] = list[depth][1];
            } else { /* d<=256 */
                if( d <= 160 ) {
                    VU.U[0]=BITOFLIST( d );
                } else if( d <= 192 ) { 
                    VU.U[1]=BITOFLIST( d );
                } else if( d <= 224 ) {
                    VU.U[2]=BITOFLIST( d );
                } else {  /* d <= 256 */
                    VU.U[3]=BITOFLIST( d );
                }
                list[depth+1][0] = list[depth][0];
                list[depth+1][1] = vec_or(list[depth][1],VU.V); /* vec_add should work too, faster?? */
            }
            
            /* prepare the new DIST[D+1]=DIST|LIST[D+1]*/
            dist[depth+1][0] = vec_or(list[depth+1][0],dist[depth][0]);
            dist[depth+1][1] = vec_or(list[depth+1][1],dist[depth][1]);

            /* prepare COMP[D+1]=COMP|DIST[D+1]*/
            comp[depth+1][0] = vec_or(dist[depth+1][0],comp[depth][0]);
            comp[depth+1][1] = vec_or(dist[depth+1][1],comp[depth][1]);
            
            count[depth+1] = count[depth]; /* pass on our current length*/
            depth++;			   /* one level deeper*/

            continue; /* go back to the start of our while(1) loop*/
        }
        
    up:
        if (--depth == 0) return nodes;/* EXIT if we reached end of search space */
    }

    return nodes;
}

#ifndef AS_MODULE

#include <sys/time.h>
#include <sys/resource.h>

double dtime()
{
    struct rusage rusage;
    double q;

    getrusage(RUSAGE_SELF,&rusage);

    q = (double)(rusage.ru_utime.tv_sec);
    q = q + (double)(rusage.ru_utime.tv_usec) * 1.0e-06;
	
    return q;
}

int main (int argc, const char * argv[])
{
    double starttime, benchtime;
    unsigned int nodes;
    
    printf("\nvecGARSP started...\n");
    
    starttime = dtime();
    nodes = vecGARSP(13,127);
    benchtime = dtime() - starttime;
    
    printf("Nodes per second = %.3f ( %u nodes/ %.3f seconds)\n",nodes/benchtime,nodes,benchtime);    
    
    return 0;
}

#endif
