/*
mpcx_tree

high-level functions working with trees of polynomials

Copyright (C) 2009, 2010, 2011, 2012, 2020 Andreas Enge

This file is part of the MPFRCX Library.

The MPFRCX Library is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 3 of the License, or (at your
option) any later version.

The MPFRCX Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
License for more details.

You should have received a copy of the GNU Lesser General Public License
along with the MPFRCX library; see the file COPYING.LESSER.  If not, write to
the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
MA 02111-1307, USA.
*/

#include "mpfrcx-impl.h"

static void mpcx_evaltree (mpcx_tree_ptr t, mpcx_tree_srcptr subproducts,
   mpcx_srcptr f);
static void mpcx_multieval_fast (mpc_t *values, mpc_t *args, int no,
   mpcx_srcptr f);

/**************************************************************************/

void mpcx_tree_init (mpcx_tree_ptr t, int no, mpfr_prec_t prec) {
   /* initialises a binary tree with "no" leaves for storing polynomials  */
   /* of precision "prec"                                                 */
   /* All tree nodes are allocated and initialised.                       */

   int i, j, tmp;

   t->no = no;
   for (i = no, t->levels = 1; i > 1; i = (i+1) / 2, t->levels++);
   t->node = (mpcx_t **) malloc (t->levels * sizeof (mpcx_t *));
   t->width = (int *) malloc (t->levels * sizeof (int));

   for (i = 0, tmp = no; i < t->levels; i++, tmp = (tmp + 1) / 2) {
      t->width [i] = tmp;
      t->node [i] = (mpcx_t *) malloc (tmp * sizeof (mpcx_t));
      for (j = 0; j < tmp; j++)
         mpcx_init (t->node [i][j], 2, prec);
   }
}

/**************************************************************************/

void mpcx_tree_clear (mpcx_tree_ptr t) {
   /* clears t and all the polynomials on its nodes                       */

   int i, j;

   for (i = 0; i < t->levels; i++) {
      for (j = 0; j < t->width [i]; j++)
         mpcx_clear (t->node [i][j]);
      free (t->node [i]);
   }
   free (t->node);
   free (t->width);
}

/**************************************************************************/

void mpcx_tree_get_root (mpcx_ptr f, mpcx_tree_srcptr t) {
   /* assigns the root of t to f */

   mpcx_set (f, t->node [t->levels-1][0]);
}

/**************************************************************************/

void mpcx_subproducttree (mpcx_tree_ptr t, mpcx_t *factors) {
   /* takes an array of t->no polynomials and returns via t the tree      */
   /* containing their subproducts: res [0,j] contains factors [j], and   */
   /* res [i,j] contains res [i-1,2j] * res [i-1,2j+1].                   */
   /* Optimal (at least if all degrees are the same) version as found out */
   /* with Guillaume and Pierrick: always multiply two factors, sweeping  */
   /* the array from left to right                                        */
   /* t must be initialised with the correct number of leaves (as many as */
   /* there are entries in "factors") and precision (usually the common   */
   /* precision of all entries in "factors").                             */

   int i, j;

   /* construct the basic layer */
   for (j = 0; j < t->no; j++)
      mpcx_set (t->node [0][j], factors [j]);

   for (i = 1; i < t->levels; i++) {
      /* construct layer i */
      for (j = 0; j < t->width [i-1] / 2; j++)
         mpcx_mul (t->node [i][j],
                   t->node [i-1][2*j], t->node [i-1][2*j+1]);
      if (t->width [i-1] % 2 != 0)
         mpcx_set (t->node [i][t->width [i] - 1],
                   t->node [i-1][t->width [i-1] - 1]);
   }
}

/**************************************************************************/

static void mpcx_evaltree (mpcx_tree_ptr t, mpcx_tree_srcptr subproducts,
   mpcx_srcptr f) {
   /* second step in multipoint evaluation; takes a subproduct tree with  */
   /* "no" entries and a polynomial f, and returns in t the tree of the   */
   /* f mod subproducts [i][j], computed at the precision of f            */
   /* t must be initialised with the same number of leaves as the         */
   /* subproduct tree.                                                    */

   int      i, j;

   mpcx_rem (t->node [subproducts->levels-1][0],
             f, subproducts->node [t->levels-1][0]);
   for (i = t->levels - 2; i >= 0; i--)
      for (j = 0; j < t->width [i]; j++)
         mpcx_rem (t->node [i][j],
                   t->node [i+1][j/2], subproducts->node [i][j]);
}

/***************************************************************************/

static void mpcx_multieval_fast (mpc_t *values, mpc_t *args, int no,
                                 mpcx_srcptr f) {
   /* The internal version of the function. Always builds a subproduct    */
   /* tree independently of the degree of f and the number of arguments.  */

   mpcx_tree_t subproducts, evaltree;
   mpcx_t*  factors;
   int      i, j;

   factors = (mpcx_t *) malloc (no * sizeof (mpcx_t));
   for (i = 0; i < no; i++) {
      mpcx_init (factors [i], 2, f->prec);
      factors [i]->deg = 1;
      mpc_set_ui (factors [i]->coeff [1], 1, MPC_RNDNN);
      mpc_neg (factors [i]->coeff [0], args [i], MPC_RNDNN);
   }

   mpcx_tree_init (subproducts, no, factors [0]->prec);
   mpcx_subproducttree (subproducts, factors);

   mpcx_tree_init (evaltree, no, factors [0]->prec);
   mpcx_evaltree (evaltree, subproducts, f);

   for (j = 0; j < no; j++)
      mpc_set (values [j], evaltree->node [0][j]->coeff [0], MPC_RNDNN);

   mpcx_tree_clear (subproducts);
   mpcx_tree_clear (evaltree);

   for (i = 0; i < no; i++)
      mpcx_clear (factors [i]);
   free (factors);
}

/**************************************************************************/

void mpcx_multieval (mpc_t *values, mpc_t *args, int no, mpcx_srcptr f) {
   /* evaluates the polynomial f in the "no" arguments args and returns   */
   /* the results via values. values must contain sufficient space, and   */
   /* each entry must be initialised.                                     */
   /* values and args may point to the same array.                        */
   /* Works only if f and values have the same precision.                 */
   /* Splits the arguments into portions of about half the degree of f    */
   /* and calls the fast multipoint evaluation routine on the chunks.     */

   int chunks, size;
   int i;

   if (f->deg <= 1)
      chunks = no;
   else if (f->deg > 2 * no)
      chunks = 1;
   else
      chunks = (2 * no) / f->deg;
   size = no / chunks;

   for (i = 0; i < chunks - 1; i++)
      mpcx_multieval_fast (values + i*size, args + i*size, size, f);
   mpcx_multieval_fast (values + (chunks - 1)*size, args + (chunks - 1)*size,
         no - (chunks - 1) * size, f);
}

/***************************************************************************/

void mpcx_hecke (mpcx_ptr rop, mpcx_tree_srcptr subproducts, mpcx_t *vals) {
   /* Let factors [0],... be the leaves of subproducts, no their number,   */
   /* and f=\prod_{i=0}^{no-1} factors [i] the polynomial at the root of   */
   /* subproducts.                                                         */
   /* Returns \sum_{i=0}^{no-1} val [i]*f/factors [i], which is a          */
   /* simplified case of Lagrange interpolation and occurs when computing  */
   /* the Hecke representation of an algebraic number with conjugates      */
   /* val [i] in a number field defined by f.                              */

   mpcx_t *hecke_old, *hecke_new = NULL; /* silence a spurious gcc warning */
      /* two consecutive layers in the Hecke tree */
   mpcx_t tmp;
   const int length = 2;
   mpfr_prec_t prec;
   int i, j;

   prec = subproducts->node [0][0]->prec;
   mpcx_init (tmp, length, prec);

   hecke_old = (mpcx_t *) malloc (subproducts->no * sizeof (mpcx_t));
   for (j = 0; j < subproducts->no; j++)
      mpcx_init_set (hecke_old [j], vals [j]);

   for (i = 1; i < subproducts->levels; i++) {
      /* compute layer i */
      hecke_new = (mpcx_t *) malloc (subproducts->width [i] * sizeof (mpcx_t));
      for (j = 0; j < subproducts->width [i-1] / 2; j++) {
         mpcx_init (hecke_new [j], length, prec);
         mpcx_mul (hecke_new [j],
                   hecke_old [2*j], subproducts->node [i-1][2*j+1]);
         mpcx_mul (tmp,
                   hecke_old [2*j+1], subproducts->node [i-1][2*j]);
         mpcx_add (hecke_new [j], hecke_new [j], tmp);
      }
      if (subproducts->width [i-1] % 2 != 0)
         mpcx_init_set (hecke_new [subproducts->width [i] - 1],
                        hecke_old [subproducts->width [i-1] - 1]);

      /* clear layer i-1 */
      for (j = 0; j < subproducts->width [i-1]; j++)
         mpcx_clear (hecke_old [j]);
      free (hecke_old);

      /* swap */
      hecke_old = hecke_new;
         /* One could save a few mallocs and frees and reuse entries of
            hecke_old, but this is probably not worth it.               */
   }
   mpcx_clear (tmp);
   
   mpcx_set (rop, hecke_old [0]);
   mpcx_clear (hecke_old [0]);
   free (hecke_old);
}

/***************************************************************************/

void mpcx_product_and_hecke (mpcx_t *rop, mpcx_t **vals, int no_pols,
   int no_factors) {
   /* vals is an array of no_pols arrays with no_factors entries. The      */
   /* functions uses vals [0] to compute a subproduct tree, the root of    */
   /* which is returned in rop [0]. Then the Hecke representation for the  */
   /* vals [i] is computed for i>=1 and returned via rop [i].              */
   /* rop needs to have no_pols entries, all of which must be initialised. */
   /* The function is written with as few nested loops as possible, so     */
   /* that it requires more index arithmetic, but is easier to parallelise */
   /* via MPI.                                                             */

   const int length = 2;
   const mpfr_prec_t prec = vals [0][0]->prec;
   int level = 0, width = no_factors, width_new, firsthalf;
   int l, m, i, j, j1, j2, j3;
   mpcx_t *new, *old;
      /* two consecutive layers of the trees */
   mpcx_t tmp;

   mpcx_init (tmp, length, prec);
   old = (mpcx_t *) malloc (no_pols * width * sizeof (mpcx_t));
   for (i = 0; i < no_pols; i++)
      for (j = 0; j < width; j++)
         mpcx_init_set (old [i*width + j], vals [i][j]);

   while (width > 1) {
      /* compute new layer */
      level++;
      width_new = (width + 1) / 2;
      firsthalf = width / 2;

      /* initialise new layer */
      new = (mpcx_t *) malloc (no_pols * width_new * sizeof (mpcx_t));
      for (m = 0; m < no_pols * width_new; m++)
         mpcx_init (new [m], length, prec);

      /* carry out the multiplications */
      for (l = 0; l < (2 * no_pols - 1) * firsthalf; l++) {
         i = ((l / firsthalf) + 1) / 2;
         j = l - (i == 0 ? 0 : (2*i-1) * firsthalf);
         if (j < firsthalf) {
            j3 = j;
            j1 = 2*j;
            j2 = j1+1;
         }
         else {
            j3 = j-firsthalf;
            j2 = 2*(j-firsthalf);
            j1 = j2+1;
         }
         mpcx_mul (tmp, old [0*width+j1], old [i*width+j2]);
         mpcx_add (new [i*width_new+j3], new [i*width_new+j3], tmp);
            /* some of these additions actually add to the zero
               polynomial, but this allows to un-nest the loops */
      }

      /* copy odd (pun intended!) left-overs */
      if (width % 2 != 0)
         for (i = 0; i < no_pols; i++)
            mpcx_set (new [(i+1)*width_new-1], old [(i+1)*width-1]);

      /* clear old layer */
      for (m = 0; m < no_pols * width; m++)
         mpcx_clear (old [m]);
      free (old);

      /* swap */
      old = new;
      width = width_new;
   }

   for (i = 0; i < no_pols; i++) {
      mpcx_set (rop [i], old [i]);
      mpcx_clear (old [i]);
   }
   free (old);
   mpcx_clear (tmp);
}

/***************************************************************************/

void mpcx_reconstruct (mpcx_ptr rop, mpcx_t *factors, int no)
   /* takes an array of no polynomials and multiplies them all together   */
   /* into rop                                                            */

{
   int i, layer = 1;
   mpcx_t *op;

   op = (mpcx_t *) malloc (no * sizeof (mpcx_t));
   /* copy factors into op */
   for (i = 0; i < no; i++)
      mpcx_init_set (op [i], factors [i]);

   while (no > 1) {
      for (i = 0; i < no / 2; i++)
         /* replace op [i] by op [2i] * op [2i+1] */
         mpcx_mul (op [i], op [2*i], op [2*i+1]);
      if (no % 2 != 0)
         mpcx_swap (op [no / 2], op [no - 1]);
      for (i = (no + 1) / 2; i < no; i++)
         mpcx_clear (op [i]);
      no = (no + 1) / 2;
      layer++;
   }

   /* The result is now in op [0]. */
   mpcx_set (rop, op [0]);
   mpcx_clear (op [0]);
   free (op);
}

/***************************************************************************/