#ifdef SPLINES
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include "../binary_c_prototypes.h"
#include "../binary_c_macros.h"
#include "../binary_c_code_options.h"


/*
 * A function to perform n-dimensional linear interpolation on a table
 * of data.
 *
 * The table should be organised (in memory) like this
 *
 * p1[0] p2[0] p3[0] ... d1[0] d2[0] d3[0] d4[0] ...  
 * p1[1] p2[1] p3[1] ... d1[1] d2[1] d3[1] d4[1] ...  
 * p1[2] p2[2] p3[2] ... d1[2] d2[2] d3[2] d4[2] ...  
 *
 * so there are n parameters (p1, p2, p3...) and d data items
 * per data line (d1, d2, d3, d4 ...). There are l lines of data.
 *
 * The parameters should be ordered from low to high values. 
 * The parameters should be on a *constant* grid (which does NOT
 * need to be regular).
 *
 * What does this mean?
 *
 * This is a good data table:
 *
 * 0.1 -100 10 ...data...
 * 0.1 -100 25 ...data...
 * 0.1 -100 30 ...data...
 * 0.1  -50 10 ...data...
 * 0.1  -50 25 ...data...
 * 0.1  -50 30 ...data...
 * 0.3 -100 10 ...data...
 * 0.3 -100 25 ...data...
 * 0.3 -100 30 ...data...
 * 0.3  -50 10 ...data...
 * 0.3  -50 25 ...data...
 * 0.3  -50 30 ...data...  
 * 0.9 -100 10 ...data...
 * 0.9 -100 25 ...data...
 * 0.9 -100 30 ...data...
 * 0.9  -50 10 ...data...
 * 0.9  -50 25 ...data...
 * 0.9  -50 30 ...data...  
 *
 * In the above case, the parameters have values:
 * p0 : 0.1,0.3,0.9
 * p1 : -100, -50
 * p2 : 10,25,30
 *
 * The parameter "hypercube" then is the 3D-hypercube of
 * 3 * 2 * 3 = 18 data lines.
 * 
 * Note that the points on the cube are *constant* but not *regular*,
 * e.g. p0 has spacing (0.3-0.1)=0.2 and (0.9-0.3)=0.6, which are different,
 * BUT e.g. the spacing for p1 (-50 - -100 = 50) is the same, whatever the
 * value of p0. The same is true for p2 which always has spacings 15 and 5
 * from (25-10) and (30-25) respectively.
 *
 * Note that the table is assumed to be sorted from SMALLEST
 * to LARGEST parameter values. It is also assumed to be regular and filled.
 * So no missing data please, just put some dummy values in the table.
 *
 * In order to interpolate data, n parameters are passed into this 
 * routine in the array x. The result of the interpolation is put
 * into the array r (of size d).  
 *
 * If you enable INTERPOLATE_CACHE then results are cached to avoid
 * slowing the code too much. (This is set in binary_c_code_options.h) This means
 * that the interpolate routine checks your input parameters x against
 * the last few sets of parameters passed in. If you have used these x
 * recently, the result is simply returned. This saves extra interpolation
 * and is often faster.  This is only true in some cases of course - if your
 * x are always different you waste time checking the cache. This is why
 * the cache_hint variable exists: if this is false then the cache is skipped.
 * Of course only *you* know if you are likely to call the interpolate routine
 * repeatedly with the same values... I cannot possibly know this in advance!
 *
 * The interpolation process involves finding the lines of the data table
 * which span each parameter x. This makes a hypercube of length 2^n (e.g.
 * in the above it is 8, for simple 1D linear interpolation it would be the
 * two spanning values). Linear interpolation is then done in the largest
 * dimension, above this is the third parameter (p2), then the second (p1)
 * and finally on the remaining variable (p0). This would reduce the table
 * from 2^3 lines to 2^2 to 2^1 to (finally) 2^0 i.e. one line which is the 
 * interpolation result.
 *
 * To find the spanning lines a binary search is performed. This code was
 * donated by Evert Glebbeek. See e.g. 
 * http://en.wikipedia.org/wiki/Binary_search_algorithm
 * and note the comment "Although the basic idea of binary search is comparatively 
 * straightforward, the details can be surprisingly tricky... " haha! :)
 *
 * Each table has its own unique table_id number. This is just to allow
 * us to set up caches (one per table) and save arrays such as varcount and
 * steps (see below) so they only have to be calculated once.
 *
 * I have optimized this as best I can, please let me know if
 * you can squeeze any more speed out of the function.
 * I am sorry this has made most of the function unreadable! The 
 * comments should help, but you will need to know some tricks...
 *
 * Certain variables deserve extra mention:
 * 
 * varcount
 *   This is the number of unique values of a given parameter. In the above
 * example, this would be varcount[0]=3, varcount[1]=2, varcount[2]=3
 *
 * steps
 *   This is the number of lines of the table there are before a
 * parameter changes. In the above example this would be steps[0]=6,
 * steps[1]=3, steps[2]=1
 *
 * Both varcount and steps are cached for each table.
 *
 * interpolate_table
 *   This is the initially constructed hypercube which is reduced
 * in dimension (e.g. in the above from 3 to 2 to 1) until final 
 * interpolation is done.
 *
 * INTERPOLATE_CACHE_LENGTH
 *   The length of the cache, i.e. the number of recent (parameters, result)
 *   lines stored for each table. Should be >= the number of tables you 
 *   are using. Usually 5 is good, less means faster cache compares
 *   but you're less likely to match!
 *
 * Note: uses Fequal macros to check for floating-point equality, you should
 * check that TINY is small enough for your parameters, or rescale your
 * parameters so they are much bigger than TINY (that's best, 
 * as TINY may be used elsewhere). You could, of course, define a 
 * different (slower) macro.
 *
 * Rob Izzard, 2005-2010, please send bug fixes!
 */

/* NEWCACHE should be faster but often isn't : needs work */
#define NEWCACHE

/* enable malloc/calloc checks : done once, should be fast */
#define ALLOC_CHECKS

/* enable debugging output (lots of output!) */
#define INTERPOLATE_DEBUG

// enable this to show the whole table
#define INTERPOLATE_DEBUG_SHOW_TABLE

/*
 * Either use realloc for memory allocation, or a fresh malloc
 * each time. Test which is faster on your machine, and use it.
 * On all machines I have tested, realloc is faster.
 */
#define INTERPOLATE_USE_REALLOC

/*
 * Check for memory leaks (bad things!)
 */
//#define MEMLEAK_CHECKS


/*
 * Allocator front-ends. With MEMLEAK_CHECKS they map to the logging
 * wrappers defined at the end of this file, otherwise straight to the
 * C library. All four macros are defined in both branches so code
 * written against them compiles whichever branch is active.
 */
#ifdef MEMLEAK_CHECKS
#define INTERPOLATE_MALLOC(A) memleak_check_malloc(A)
#define INTERPOLATE_CALLOC(A,B) memleak_check_calloc(A,B)
#define INTERPOLATE_REALLOC(A,B) memleak_check_realloc(A,B)
#define INTERPOLATE_FREE(A) memleak_check_free(A)
void *memleak_check_calloc(size_t nmemb, size_t size);
void *memleak_check_malloc(size_t size);
void memleak_check_free(void *ptr);
void *memleak_check_realloc(void *ptr, size_t size);
static THREAD_LOCAL size_t interpolate_mem_allocated=0;
#else
#define INTERPOLATE_MALLOC(A) malloc(A)
#define INTERPOLATE_CALLOC(A,B) calloc(A,B)
#define INTERPOLATE_REALLOC(A,B) realloc(A,B)
#define INTERPOLATE_FREE(A) free(A)
#endif


/* use a smaller TINY (cannot remember why!) */
#undef TINY
#define TINY 1e-20

/*
 * FLUSH flushes stdout in debug builds and expands to nothing otherwise.
 * The macro carries its own semicolon, so "FLUSH;" is valid either way.
 */
#ifdef INTERPOLATE_DEBUG
#define FLUSH fflush(stdout);
#else
#define FLUSH /* */
#endif

#ifdef NEWCACHE
/* 
 * with the new cache we use macros to
 * access cache items, rather than accessing 
 * them directly.
 *
 * These macros expand to expressions using the identifiers n, d and
 * cache, which must be in scope wherever they are used. Each cache
 * line holds the n parameters followed by the d results.
 */

/* length of a line in the cache */
#define INTERPOLATE_CACHE_LINE (n+d)

/* pointer to the cached parameters (start of cache line A) */
#define INTERPOLATE_CACHE_PARAM(A) (cache+INTERPOLATE_CACHE_LINE*(A))

/* pointer to the cached results of cache line A (n items after the start) */
#define INTERPOLATE_CACHE_RESULT(A) (cache+INTERPOLATE_CACHE_LINE*(A)+n)

/* memcpy is usually faster for copying interpolation results to the cache */
#define NEWCACHE_MEMCPY

#endif


/*
 * interpolate_cubic_spline
 *
 * Table interpolation on an n-parameter grid, adapted from the linear
 * interpolation scheme described at the top of this file.
 *
 * Arguments:
 *   table      l lines, each of n parameters followed by d data items.
 *              Passing NULL frees every buffer this function has
 *              allocated (caches, steps, varcount, work table) and returns.
 *   n, d, l    number of parameters, data items per line, and lines.
 *   x          the n parameter values to interpolate at.
 *   r          receives the d interpolated data items.
 *   table_id   index into the per-table caches; it is used unchecked as an
 *              index into arrays of NUMBER_OF_INTERPOLATION_TABLES entries.
 *   cache_hint if true (and INTERPOLATE_CACHE is set) recent results are
 *              looked up in, and stored to, the per-table cache.
 *
 * Allocation failures are reported through Exit_binary_c when ALLOC_CHECKS
 * is defined.
 *
 * NOTE(review): this routine is unfinished. After picking a span around
 * the grid point closest to x it still reduces the hypercube with linear
 * weights f[], the work table is sized for 3^n lines while lines are
 * selected with the 2^n bit pattern, and the span limits are taken from
 * l rather than varcount[j]. Treat results as experimental.
 *
 * NOTE(review): the USE_TMPSTORE branch refers to "stardata", which is not
 * a parameter here, and maps int_table while the body uses
 * interpolate_table -- presumably never built with USE_TMPSTORE; confirm.
 */
void interpolate_cubic_spline(const double * RESTRICT table, // (pointer to) the data table
                              const int n, // the number of parameters (i.e. dimensions)
                              const int d, // the number of data items
                              const int l, // the number of lines of data
                              const double * RESTRICT x, // the values of the parameters
                              double * RESTRICT r,  // the result of the interpolation
                              const unsigned int table_id, // table identifier
                              const Boolean cache_hint // tells us to use the cache, or not
    )
{
    /* define UNSIGNED to actually be unsigned : it should be better
     * for large numbers and the ints really are >=0.
     * This must come before the first declaration that uses it. */
#define UNSIGNED unsigned
//#define UNSIGNED /**/

    /* local temporary variables (u is pre-set because debug output prints it) */
    double u=0.0,v;

#ifdef USE_TMPSTORE

#define int_table stardata->tmpstore->interpolate_table
#define table_ids stardata->tmpstore->interpolate_table_ids
#define f stardata->tmpstore->interpolate_f
#define cache_spin_line stardata->tmpstore->interpolate_cache_spin_line
#define results_array stardata->tmpstore->interpolate_results_array
#define number_of_interpolation_tables stardata->tmpstore->number_of_interpolation_tables
#define steps_array stardata->tmpstore->interpolate_steps_array
#define varcount_array stardata->tmpstore->interpolate_varcount_array
#define cache_match_line stardata->tmpstore->interpolate_cache_match_line
#define cache_array stardata->tmpstore->interpolate_cache_array

#else

    // interpolate_table = the interpolation table (i.e. the hypercube)
#ifdef INTERPOLATE_USE_REALLOC
    static THREAD_LOCAL
#endif
        double * RESTRICT interpolate_table=NULL;

    // f = hypercube limits
#ifdef INTERPOLATE_USE_REALLOC
    static THREAD_LOCAL
#endif
        double * RESTRICT  f=NULL;

#ifdef INTERPOLATE_CACHE
    static THREAD_LOCAL double ** cache_array=NULL;
    static THREAD_LOCAL int cache_match_line[NUMBER_OF_INTERPOLATION_TABLES];
#ifdef NEWCACHE
    static THREAD_LOCAL int cache_spin_line[NUMBER_OF_INTERPOLATION_TABLES];
#else
    static THREAD_LOCAL double ** results_array=NULL;
#endif
#endif
//INTERPOLATE_CACHE

    /* arrays of arrays for storing the cached steps and varcount */
    static THREAD_LOCAL UNSIGNED int ** RESTRICT steps_array=NULL;
    static THREAD_LOCAL UNSIGNED int ** RESTRICT varcount_array=NULL;

#endif // USE_TMPSTORE *******************************************

#ifdef INTERPOLATE_CACHE
    /* variables for storing the cached data */
    double * cache=NULL;
#ifndef NEWCACHE
    double * results=NULL;
#endif
#endif //INTERPOLATE_CACHE

    /* local steps and varcount (see above for a description) */
    UNSIGNED int * RESTRICT steps;
    UNSIGNED int * RESTRICT varcount;
    UNSIGNED int * RESTRICT sum=NULL;

    // more temporary variables (e.g. binary search admin etc)
    UNSIGNED int  a,b,c,setcount=0;
    // counters which are used a lot but calculated here just once
    const UNSIGNED int lnl=n+d;
    const UNSIGNED int dd=sizeof(double)*d;
    const UNSIGNED int nd=sizeof(double)*n;
    const UNSIGNED int lnl_sizeof=lnl*sizeof(double);
    const UNSIGNED int max_itc_size=Power_of_integer(3,n);
    const UNSIGNED int max_itc_sizeof=max_itc_size*sizeof(UNSIGNED int);
    /* dim starts as the byte size of the parameter vector (used for the
     * allocation and memcmp) and is later reused as the dimension counter */
    UNSIGNED int  g,i=0,j,k,dim=nd;
    const int n1=n-1;
    const UNSIGNED int table_len=lnl*l;

#ifdef INTERPOLATE_DEBUG
    printf("DEBUG INTERPOLATE dd=%d nd=%d lnl=%d lnl_sizeof=%d max_itc_size=%d max_itc_sizeof=%d table_len=%d\n",
           dd,nd,lnl,lnl_sizeof,max_itc_size,max_itc_sizeof,table_len);
#endif

    /*
     * If table is NULL we should free all allocated memory and just return
     */
    if(table==NULL)
    {
        if(steps_array!=NULL)
        {
            for(g=0;g<NUMBER_OF_INTERPOLATION_TABLES;g++)
            {
#ifdef INTERPOLATE_CACHE
                Safe_free(cache_array[g]);
#endif // INTERPOLATE_CACHE
                Safe_free(varcount_array[g]);
                Safe_free(steps_array[g]);
            }
#ifdef INTERPOLATE_CACHE
            Safe_free(cache_array);
            cache_array=NULL;
#ifndef NEWCACHE
            /* results_array pointed into cache_array's allocation */
            results_array=NULL;
#endif
#endif
            Safe_free(varcount_array);
            varcount_array=NULL;
            /*
             * steps_array points into the allocation just released: it
             * must be reset too, otherwise a second clean-up call would
             * pass the test above and walk freed memory.
             */
            steps_array=NULL;
        }
#ifdef INTERPOLATE_USE_REALLOC
        /* f and sum live inside interpolate_table's allocation */
        Safe_free(interpolate_table);
        interpolate_table=NULL;
        f=NULL;
#endif
        return;
    }

#ifdef INTERPOLATE_DEBUG
    printf("Interpolate %d lines of data (%d data items, %d free parameters, lnl=%d) table_id=%d, cache_hint=%d\n",l,d,n,lnl,table_id,cache_hint);
#endif

#ifdef INTERPOLATE_DEBUG_SHOW_TABLE
    for(i=0;i<l;i++)
    {
        printf("L%d ",i);
        for(j=0;j<lnl;j++)
        {
            printf("%g ",*(table+i*lnl+j));
        }
        printf("\n");
        FLUSH;
    }
#endif

#ifdef INTERPOLATE_CACHE

#ifdef INTERPOLATE_DEBUG
#ifdef NEWCACHE
    printf("Interpolate: check cache (table_id=%d, cache_array=%p)\n",table_id,cache_array);
#else
    printf("Interpolate: check cache (table_id=%d, cache_array=%p,results_array=%p)\n",
           table_id,cache_array,results_array);
#endif
    FLUSH;
#endif

    /*
     * First time through: set up the cache memory space for the 
     * arrays of pointers
     */
    if(cache_array==NULL)
    {
        // cache_array holds pointers to the cache for each table
        // results_array ditto for the results
        cache_array=INTERPOLATE_CALLOC(2*NUMBER_OF_INTERPOLATION_TABLES,
                                       sizeof(double*));
#ifdef ALLOC_CHECKS
        if(cache_array==NULL)
        {
            Exit_binary_c(BINARY_C_CALLOC_FAILED_IN_INTERPOLATE,"failed to alloc\n");
        }
#endif
#ifndef NEWCACHE
        results_array=cache_array+NUMBER_OF_INTERPOLATION_TABLES;
#endif
    }

    if(cache_array[table_id]==NULL)
    {
        cache_match_line[table_id]=0;
#ifdef NEWCACHE
        /* -1 means "nothing stored yet" */
        cache_spin_line[table_id]=-1;
#endif
#ifdef INTERPOLATE_DEBUG
        printf("Allocated new cache array for table_id=%d\n",table_id);
#endif
        /*
         * Allocate cache space for this interpolation table
         */
        cache_array[table_id]=
            INTERPOLATE_CALLOC(lnl*INTERPOLATE_CACHE_LENGTH,sizeof(double));
        cache=cache_array[table_id];

#ifdef ALLOC_CHECKS
        if(cache==NULL)
        {
            Exit_binary_c(BINARY_C_CALLOC_FAILED_IN_INTERPOLATE,"Failed to alloc cache_array \n");
        }
#endif

#ifndef NEWCACHE
        /* old layout: all parameter lines first, then all result lines */
        results_array[table_id] = cache +
            n*INTERPOLATE_CACHE_LENGTH;
        results = results_array[table_id];
#endif //not NEWCACHE
    }
    else if(cache_hint>0)
    {
        /*
         * Check the cache for the current parameters
         */
        cache=cache_array[table_id];

#ifndef NEWCACHE
        results=results_array[table_id];
#endif // not NEWCACHE

        /*
         * if no cache was previously saved
         * there is no point trying 
         */
#ifdef NEWCACHE
        if(cache_spin_line[table_id] != -1)
#endif
        {
            /*
             * Start the loop at the line which matched last time and
             * wrap around the whole cache
             */
            int imax=cache_match_line[table_id]+
                INTERPOLATE_CACHE_LENGTH;
            int iloop,iline;
            for(iloop=cache_match_line[table_id];
                iloop<imax;
                iloop++)
            {
                iline=iloop%INTERPOLATE_CACHE_LENGTH;

                /*
                 * Which is quicker, using MEMCMP or using a direct comparison?
                 * MEMCMP seems slower than a direct array-based approach!
                 */

                //#define USE_MEMCMP
#ifdef USE_MEMCMP
#ifdef NEWCACHE
                if(memcmp(INTERPOLATE_CACHE_PARAM(iline),x,dim)==0)
#else
                if(memcmp(cache+iline*n,x,dim)==0)
#endif // NEWCACHE

#else // USE_MEMCMP

                Boolean match=TRUE;
                int m;
                double *cacheline;
#ifdef NEWCACHE
                cacheline=INTERPOLATE_CACHE_PARAM(iline);
#else
                cacheline=cache+iline*n;
#endif // NEWCACHE

                for(m=0;m<n;m++)
                {
                    if(!Fequal(x[m],cacheline[m]))
                    {
                        match=FALSE;
                        break;
                    }
                }
                if(match==TRUE)
#endif // USE_MEMCMP

                {
                    /* 
                     * cache matches at line iline so
                     * set the interpolation result directly from
                     * the cache
                     */
#ifdef NEWCACHE
                    memcpy(r,INTERPOLATE_CACHE_RESULT(iline),dd);
#else
                    memcpy(r,results+iline*d,dd);
#endif //NEWCACHE

                    /* 
                     * Save the position of the match for 
                     * next time so we start searching at the match
                     * rather than the beginning of the cache
                     */
                    cache_match_line[table_id]=iline;

                    /* skip everything else */
                    goto cache_match;
                }
            }
        }
    }
#endif // INTERPOLATE_CACHE

    /*
     * Result is not cached, or we did not want to search the cache,
     * we must calculate it!
     * Setup / (re)allocate memory
     */
#ifdef INTERPOLATE_DEBUG
    printf("Interpolate: memory allocation\n");FLUSH;
#endif
#ifdef INTERPOLATE_USE_REALLOC
    /*
     * realloc a single piece of memory holding, in order, the work
     * table, the n interpolation factors f and the line numbers sum
     */
    interpolate_table=INTERPOLATE_REALLOC(interpolate_table,2*max_itc_size*lnl_sizeof+
                                  dim+
                                  max_itc_sizeof);
#ifdef ALLOC_CHECKS
    /* check before the pointer is offset and written through below */
    if(interpolate_table==NULL)
    {
        Exit_binary_c(BINARY_C_CALLOC_FAILED_IN_INTERPOLATE,
                      "Error allocating interpolate_table in interpolate\n");
    }
#endif
    f=interpolate_table+2*max_itc_size*lnl;
    sum=(UNSIGNED int*)(f+n);

    /* always clear sum */
    memset(sum,0,max_itc_sizeof);
#else
    /* manually malloc/calloc each time */
    interpolate_table=INTERPOLATE_MALLOC(2*max_itc_size*lnl_sizeof);
    f=INTERPOLATE_MALLOC(dim);
    sum=INTERPOLATE_CALLOC(1,max_itc_sizeof);
#endif

#ifdef INTERPOLATE_DEBUG
    printf("interpolate_table alloc 2 * %d * %d\n",max_itc_size,lnl_sizeof);
#endif

    if(varcount_array==NULL)
    {
        /* set up an array of pointers to arrays containing the varcounts */
        /* and the variable steps: steps_array is the second half of the */
        /* same allocation */
        varcount_array=INTERPOLATE_CALLOC(2*NUMBER_OF_INTERPOLATION_TABLES,sizeof(UNSIGNED int*));
        steps_array=varcount_array+NUMBER_OF_INTERPOLATION_TABLES;
    }

#ifdef ALLOC_CHECKS
    if((varcount_array==NULL)||(f==NULL)||(sum==NULL)||(interpolate_table==NULL)||(steps_array==NULL))
    {
        Exit_binary_c(BINARY_C_CALLOC_FAILED_IN_INTERPOLATE,
                      "Error allocating varcount in interpolate\n");
    }
#endif

    // set these, they're used a lot!
    k=lnl*l; g=k-lnl;

    if(steps_array[table_id]==NULL)
    {
        /*
         * First time with this table: Find the variable steps
         */
        steps=steps_array[table_id]=INTERPOLATE_CALLOC(n,sizeof(UNSIGNED int));

#ifdef ALLOC_CHECKS
        if(steps==NULL)
        {
            Exit_binary_c(BINARY_C_CALLOC_FAILED_IN_INTERPOLATE,
                          "(m|c)alloc failed in interpolate() : steps/steps_array\n");
        }
#endif

        /* loop, find where variables change */
#ifdef INTERPOLATE_DEBUG
        printf("Find where vars change (i.e. set steps_array/steps) loop to k=%d\n",k);
#endif
        setcount=0;
        for(i=0;i<k;i++)
        {
            a=i*lnl;
            b=table_len-a-1;
#ifdef INTERPOLATE_DEBUG
            printf("Line %d : ",i);
#endif
            for(j=0;j<n-setcount;j++) // j<n-setcount because we'll set variable
                // n, then n-1 etc... all the way to 0. Seems to work!
            {
#ifdef INTERPOLATE_DEBUG
                printf("Var %d (step=%d) is %g next %g ; ",j,steps[j],table[j],table[a+j]);
#endif
                /*
                 * check for table runoff: this happens if the first
                 * variable does not change
                 *
                 * NOTE(review): b is unsigned and wraps once a exceeds
                 * table_len, and for i<l it is never smaller than j, so
                 * this run-off test looks ineffective -- confirm.
                 */
                if((j>b)||((steps[j]==0)&&(!Fequal(table[j],table[a+j]))))
                {
                    steps[j]=i;
                    setcount++;
#ifdef INTERPOLATE_DEBUG
                    if(j>b)
                    {
                        printf("Var %d : table runoff\n",j);
                    }
                    else
                    {
                        printf("Var %d changes\n",j);
                    }
#endif
                    /*
                     * check if setcount > num variables i.e. if we are done 
                     */
                    if(setcount>n1)
                    {
                        i=k; // break from outer loop too
                        break;
                    }
                }
            }
#ifdef INTERPOLATE_DEBUG
            printf("\n");
#endif
        }
    }
    else
    {
        steps=steps_array[table_id]; // short cut
#ifdef INTERPOLATE_DEBUG
        printf("We already know steps[%d] (0=%d)\n",table_id,steps[0]);
#endif
    }

    /*
     * We have to count how many different values each parameter can take.
     */
    if(varcount_array[table_id]==NULL)
    {
        varcount=varcount_array[table_id]=INTERPOLATE_CALLOC(n,sizeof(UNSIGNED int));
#ifdef ALLOC_CHECKS
        if(varcount==NULL)
        {
            Exit_binary_c(BINARY_C_MALLOC_FAILED,
                          "Error allocating varcount %d in interpolate\n",
                          (int)table_id);
        }
#endif
        /* fast counting method! (lines per block / lines per step) */
        b=l;
        for(j=0;j<n;j++)
        {
#ifdef INTERPOLATE_DEBUG
            printf("Interpolate debug: varcount[%d]=%d, b=%d, steps[%d]=%d\n",
                   j,varcount[j],b,j,steps[j]);
            FLUSH;
#endif
            varcount[j]=b/steps[j];
            b=steps[j];
        }
    }
    else
    {
        varcount=varcount_array[table_id];
    }

    /*
     * Find the limits of the parameters (assumes rectangular, sorted small->large)
     */
#ifdef INTERPOLATE_DEBUG
    printf("find limits\n");FLUSH;
#endif

    /* g = offset of the last line, which holds each parameter's maximum */
    g=lnl*(l-1);

    for(j=0;j<n;j++)
    {
        /*
         * limit the value of our given parameter x[j] to the range we have 
         * and save to the parameter v
         */
#ifdef INTERPOLATE_DEBUG
        printf("Find limit %d from x=%g min=%g max=%g\n",j,x[j],table[j],table[j+g]);FLUSH;
        printf("INITIAL VALUE %g\n",table[g+j]);
#endif
        v=Min(table[g+j],x[j]);
        v=Max(table[j],v);
        k=steps[j];

#ifdef INTERPOLATE_DEBUG
        printf("Construct variable %d hypertable position (v=%g from %g < x[%d]=%g < %g)\n",j,v,
               table[j],j,
               x[j],table[g+j]);
        FLUSH;
        if(table[g+j]<x[j])
        {
            printf("WARNING data is in the wrong order! %g < %g\n",table[g+j],x[j]);
        }
        FLUSH;
#endif
        /*
         * Now we can guess the parameter value appropriate for us:
         * a and b are the binary search limits, start at a=0
         * and b=varcount[j] (the max possible value)
         */
        a=0;
        b=varcount[j];

        /* distance (in table elements) between successive values of parameter j */
        i=lnl*steps[j];

        /*
         * Binary search says the lookup value is between a and b:
         * closest is whichever of the two is nearer to v (the lower
         * one on a tie, or 0 if the parameter has only one value)
         */
        int closest=0;

        if(b>1)
        {
            /*
             * Binary search blatantly stolen from Evert Glebbeek's code
             */
            while(b>a+1)
            {
                c=(a+b)/2;
                if(v>table[c*i+j]) a=c; // u=table[c*i+j]
                else b=c; // if(Less_or_equal(v,u)) // obviously!
            }

            // calculate interpolation factor (nasty, sorry...)
            u=table[a*i+j];
            const double upper=table[b*i+j];
            f[j]=(v-u)/(upper-u);

            closest = ((upper-v)<(v-u)) ? (int)b : (int)a;

#ifdef INTERPOLATE_DEBUG
            printf("Binary search span : %d (at %g) to %d (at %g)\n",
                   a,u,b,upper);
            printf("f[%d]=%g = (%g-%g)/(%g-%g)\n",j,f[j],v,u,upper,u);
#endif
        }
        else
        {
            f[j]=0.0; // only one value to choose from! (the low value)
        }

#ifdef INTERPOLATE_DEBUG
        printf("closest to %g = %d (%g)\n",v,closest,table[closest*i+j]);
#endif

        /* hence we should span closest-1 to closest+1,
         * within the table limits
         *
         * NOTE(review): the limits below use l (the number of lines),
         * presumably varcount[j] was intended -- unfinished, confirm. */
        a= Max(0,closest-1);

        if(a==0)
        {
            b=2;
        }
        else if(a==l-1)
        {
            b=l-3;
        }
        else
        {
            b=Min(l-1,closest+1);
        }

#ifdef INTERPOLATE_DEBUG
        printf("cubic span variable %d : %d to %d\n",j,a,b);
#endif

        c=Integer_power_of_two(n1-j); // reuse variable c 

        for(i=0;i<max_itc_size;i++)
        {
            /*
             * (i&c)/c is 0 for a, 1 for b
             * in fact, for line 1 it makes (for n=3) a table
             * 0 0 0
             * 0 0 1
             * 0 1 0
             * 0 1 1
             * 1 0 0
             * 1 0 1
             * 1 1 0
             * 1 1 1
             * as required! 
             */

            /*
             * Add up coordinate of the line: this is the number
             * of the line in table, which will later be set in
             * interpolate_table.
             *
             * NB this used to be an if((i&c)/c==0) but of course
             * if i&c/c==0 then i&c==0 as well. Then why have an 
             * if at all? Just use the result of the comparison
             * without any branching.
             */
            int tmp = ((i&c)==0);
            sum[i] += (UNSIGNED int)tmp*a*k+(1-tmp)*b*k;
        }
    }

    /* watch for table overrun: valid line numbers are 0 ... l-1 */
    for(i=0;i<max_itc_size;i++)
    {
        if(sum[i]>=l) sum[i] = sum[i]%l;
    }

#ifdef INTERPOLATE_DEBUG
    printf("Parameter (x) values: ");
    for(j=0;j<n;j++)
    {
        printf("% 3.3e ",x[j]);
    }
    printf("\n");

    printf("Interpolation (f) factors: ");
    for(j=0;j<n;j++)
    {
        printf("% 3.3e ",f[j]);
    }
    printf("\n");

    printf("Interpolation hypertable:\n");
#endif

    /* Construct hypertable lines from coordinates */
    k=0;

    /* easily vectorized loop: line number -> element offset */
    for(i=0;i<max_itc_size;i++)
    {
        sum[i] *= lnl;
    }

    for(i=0;i<max_itc_size;i++)
    {
#ifdef INTERPOLATE_DEBUG
        printf("memcpy %p to %p (<%p %d) %p %d\n",
               (void*)(interpolate_table+i*lnl),
               (void*)(interpolate_table+(i+1)*lnl),
               (void*)(interpolate_table+2*max_itc_size*lnl),
               (i+1)*lnl <= 2*max_itc_size*lnl,
               (const void*)(table+sum[i]),
               lnl_sizeof);
        printf("Interpolation (f=%p to %p) factors: ",(void*)f,(void*)(f+n));
        for(j=0;j<n;j++)
        {
            printf("% 3.3e ",f[j]);
        }
        printf("\n");
#endif

        // we have the coords, set the data in the interpolate_table
        memcpy(interpolate_table+k,table+sum[i],lnl_sizeof);

        // NB += avoids interpolate_table+i*lnl i.e. multiply
        k+=lnl;

#ifdef INTERPOLATE_DEBUG
        printf("Line %d : ",i);FLUSH;
        for(j=0;j<n;j++)
        {
            printf("% 3.3e ",*(interpolate_table+i*lnl+j));FLUSH;
        }
        printf(" | ");FLUSH;
        for(j=n;j<lnl;j++)
        {
            printf("% 3.3e ",*(interpolate_table+i*lnl+j));FLUSH;
        }
        printf(" %d/%d\n",i,max_itc_size-1);FLUSH;
#endif
    }

#ifdef INTERPOLATE_DEBUG
    printf("done hypertable\n");
    printf("Interpolation (f) factors: ");
    for(j=0;j<n;j++)
    {
        printf("% 3.3e ",f[j]);
    }
    printf("\n");
#endif

    /*
     * Do the interpolation: each pass folds the upper half of the
     * remaining lines (offset g) into the lower half with weight f[dim]
     */
    dim=0;
    g=lnl<<n1;

    {
        /* start of the data items of the first work-table line */
        double *int_table_n=interpolate_table+n;
        while(dim<n)
        {

#ifdef INTERPOLATE_DEBUG
            printf("Interpolate dim=%d f=%g\n",dim,f[dim]);
            FLUSH;
#endif

            v=f[dim];

            if(v>TINY)
            {
                if(v+TINY>1.0)
                {
                    // u=0 v=1: unusual case but easy to calculate (no inner loop required)
                    double *line_data;
                    for(i=0;i<g;i+=lnl)
                    {
                        line_data=int_table_n+i;
                        memcpy(line_data,line_data+g,dd);
                    }
                }
                else
                {
                    /* 
                     * intermediate cases : the most common, so the most
                     * optimized!
                     */
                    u=1.0-v;

                    /* either loop over j or k, but k has fewer
                     * additions, so should be faster */
                    for(i=0;i<g;i+=lnl)
                    {
                        const UNSIGNED int kmax=i+lnl;
                        for(k=i+n;k<kmax;k++)
                        {
                            interpolate_table[k]=u*interpolate_table[k]+v*interpolate_table[k+g];
                        }
                    }
                }
            }
            // else v=0, interpolate_table[k] stays the same
            dim++;
            g/=2;
        }

#ifdef INTERPOLATE_DEBUG
        printf("memcopy results\n");FLUSH;
#endif

        /*
         * Set the result array
         */
        memcpy(r,int_table_n,dd);
    }

#ifdef INTERPOLATE_DEBUG
    printf("Result\n");
    for(j=0;j<n;j++)
    {
        printf("% 3.3e ",*(interpolate_table+j));
    }
    printf(" | ");
    for(j=n;j<lnl;j++)
    {
        printf("% 3.3e ",*(interpolate_table+j));
    }
    printf("\n");FLUSH;
#endif

#ifdef INTERPOLATE_CACHE
    /*
     * No cache match but interpolation done:
     *
     * Save the results of the interpolation into the cache
     */
    if(cache_hint>0)
    {
#ifdef NEWCACHE
        /* use the next line of the cache */
        cache_spin_line[table_id]++;
        /* avoid falling off the end of the cache */
        cache_spin_line[table_id] = cache_spin_line[table_id]%INTERPOLATE_CACHE_LENGTH;
        /* insert data : NB memcpy is definitely faster than a loop */
        memcpy(INTERPOLATE_CACHE_PARAM(cache_spin_line[table_id]),x,nd);
        memcpy(INTERPOLATE_CACHE_RESULT(cache_spin_line[table_id]),r,dd);
#else
        /* shift cache so that the most recent results are at the top */
        memmove(cache+n,cache,nd*(INTERPOLATE_CACHE_LENGTH-1));
        memmove(results+d,results,dd*(INTERPOLATE_CACHE_LENGTH-1));
        /* set the results in the interpolation cache */
        memcpy(cache,x,nd);
        memcpy(results,r,dd);
#endif
    }

cache_match:

#endif // INTERPOLATE_CACHE

#ifndef INTERPOLATE_USE_REALLOC
    /* free alloc'd memory (all three buffers are per-call in this build) */
#ifdef INTERPOLATE_DEBUG
    _printf("Free interpolate_table=%p\n",interpolate_table);
    FLUSH;
#endif
    Safe_free(interpolate_table);
#ifdef INTERPOLATE_DEBUG
    _printf("Free f = %p\n",f);
    FLUSH;
#endif
    Safe_free(f);
#ifdef INTERPOLATE_DEBUG
    fprintf(stderr,"Free sum = %p\n",sum);
    FLUSH;
#endif
    Safe_free(sum);
#endif

#ifdef INTERPOLATE_DEBUG
    /*
     * Development scaffolding: dump the input and the result, then stop
     * the program after the first interpolation. Only compiled in debug
     * builds so that a normal build returns to its caller.
     */
    {
        int q;
        printf("Interpolate cubic spline in : ");
        for(q=0;q<n;q++)
        {
            printf("%g ",x[q]);
        }
        printf("\nOut : ");
        for(q=0;q<d;q++)
        {
            printf("%g ",r[q]);
        }
        printf("\n");
        Exit_binary_c(BINARY_C_NORMAL_EXIT,"");
    }
#endif

    return;

}

#ifdef MEMLEAK_CHECKS
/*
 * wrappers for malloc, calloc, realloc and free to log
 * memory allocation
 */


    /*
     * Logging wrapper around calloc: prints the element count and element
     * size to stdout, then returns whatever calloc returns (NULL on
     * failure). The sizes are printed as size_t (%zu) so that large
     * requests are not truncated in the log.
     */
    void *memleak_check_calloc(size_t nmemb, size_t size)
    {
        printf("Calloc nmemb %zu, size %zu\n",nmemb,size);
        return calloc(nmemb,size);
    }

    /*
     * Logging wrapper around malloc: prints the requested size to stdout,
     * then returns whatever malloc returns (NULL on failure). The size is
     * printed as size_t (%zu) so that large requests are not truncated in
     * the log.
     */
    void *memleak_check_malloc(size_t size)
    {
        printf("Malloc size %zu\n",size);
        return malloc(size);
    }

    /*
     * Logging wrapper for releasing memory: prints the address to stdout
     * and then hands the pointer to Safe_free.
     */
    void memleak_check_free(void *ptr)
    {
        /* log first, so the address is recorded before it is released */
        printf("Free %p\n", ptr);

        Safe_free(ptr);
    }

    /*
     * Logging wrapper around realloc: prints the old address and the new
     * size to stdout, then returns whatever realloc returns (NULL on
     * failure, in which case the old block is still allocated). The size
     * is printed as size_t (%zu) so that large requests are not truncated
     * in the log.
     */
    void *memleak_check_realloc(void *ptr, size_t size)
    {
        printf("Realloc %p size %zu\n",ptr,size);
        return realloc(ptr,size);
    }

#endif
#endif//SPLINES
