/* xlsqr v1.0.0 Ver.: OK (c) 04/10 - Christos Iosifides */
/*
   This program solves the least squares system Ax+Bu=w in function xlsqr

   Name:			xlsqr
   Version:		    1.0.0
   Copyright:	    Ch Iossif @ 2010
   Author:		    Christos Iosifidis
   Date:			01/04/10 12:00
   Modified:		01/04/10 12:00 - 16:30 468 lines in 4h30' plus debugging !!!
   Description:     Solves the least squares system Ax+Bu=w in function xlsqr
   Compile:
            Linux:     gcc xlsqr.c -o xlsqr -lm
            Windows:   gcc xlsqr.c -o xlsqr.exe -lm or similar with any ANSI C compiler

  Input:

  M N C
  A(CxM)
  B(CxN)
  P(N)
  W(C)

  Output:

  X(M)
  U(N)
  Sigma
  Vx(MxM)

 Copyright (C) Apr 2010 Ch Iossif <chiossif@yahoo.com>

 This program is free software: you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation, either version 3 of the License, or
 (at your option) any later version.

 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.

 You should have received a copy of the GNU General Public License
 along with this program.  If not, see <http://www.gnu.org/licenses/>.
*/

#include <stdio.h>
#include <stdlib.h>
#include <math.h>

#define myABS(x) (((x)<0)?-(x):(x))
#define mySQR(x) ((x)*(x))

//#define FP_TYPE double
//#define EPSILON 1.e-15
//#define FP_FORMAT "%16.10lf",

#define FP_TYPE long double
#define EPSILON 1.e-19
#define FP_FORMAT "%24.15Lf",

//#define FP_TYPE __float128
//#define EPSILON 1.e-34
//#define FP_FORMAT "%40.34Lf",(long double)

FP_TYPE xlsqr(FP_TYPE *A, FP_TYPE *B, FP_TYPE *P, FP_TYPE *W, FP_TYPE *X, FP_TYPE *U, FP_TYPE *Vx, int M, int N, int C);
FP_TYPE mySQRT(FP_TYPE);
void gpl(void);
int Matrix_inversion(FP_TYPE *,FP_TYPE *,FP_TYPE *,int);
int Gaussian_elimination(FP_TYPE *,FP_TYPE *,FP_TYPE *,int *,int);
int Cholesky_decomposition(FP_TYPE *, FP_TYPE *, int);
void print_mat(const char *, FP_TYPE *, int, int);

int main(void) {
    int m, n, c;
    FP_TYPE *a, *b, *p, *w, *x, *u, *vx, sigma;
    long double t;
    register int i, j;

    scanf("%d %d %d", &m, &n, &c);
    a=malloc(c*m*sizeof(FP_TYPE));
    b=malloc(c*n*sizeof(FP_TYPE));
    p=malloc(n*sizeof(FP_TYPE));
    w=malloc(c*sizeof(FP_TYPE));
    x=malloc(m*sizeof(FP_TYPE));
    u=malloc(n*sizeof(FP_TYPE));
    vx=malloc(m*m*sizeof(FP_TYPE));

    for (i=0; i<c; i++)
        for (j=0; j<m; j++) {
            scanf("%Lf",&t);
            a[i*m+j]=t;
        }
    for (i=0; i<c; i++)
        for (j=0; j<n; j++) {
            scanf("%Lf",&t);
            b[i*n+j]=t;
        }
    for (i=0; i<n; i++) {
        scanf("%Lf",&t);
        p[i]=t;
    }
    for (i=0; i<c; i++) {
        scanf("%Lf",&t);
        w[i]=t;
    }

    printf("\nxlsqr solves the least squares system Ax+Bu=w\n");
    printf("\nINPUT:\n");
    printf("\nM = %d N = %d C = %d\n", m, n, c);
    print_mat("A",a,c,m);
    print_mat("B",b,c,n);
    print_mat("P",p,1,n);
    /*
        printf("\nP(%dx%d) =\n",n,n);
        for (i=0; i<n; i++) {
            for (j=0; j<n; j++)
                if (i-j)
                    printf(FP_FORMAT 0.0);
                else
                    printf(FP_FORMAT p[i]);
            printf("\n");
        }
    */
    print_mat("W",w,1,c);

    sigma=xlsqr(a,b,p,w,x,u,vx,m,n,c);

    printf("\nOUTPUT:\n");
    print_mat("x",x,1,m);
    print_mat("U",u,1,n);
    printf("\nSigma = ");
    printf(FP_FORMAT sigma);
    printf("\n");
    print_mat("Vx",vx,m,m);

    printf("\nxlsqr v1.0.0 Ver.: OK (c) 04/10 - Christos Iosifides\n");
    gpl();
    return 0;
}

FP_TYPE xlsqr(FP_TYPE *a, FP_TYPE *b, FP_TYPE *p, FP_TYPE *w, FP_TYPE *x, FP_TYPE *u, FP_TYPE *vx, int m, int n, int c) {
    FP_TYPE Sigma, *Pa, *Na, *Na_inv;
    FP_TYPE *tmp1, *tmp2, s;
    register int i, j, k;

    /* * * Pa = (B*P'*Bt)' * * */
    Pa=malloc(c*c*sizeof(FP_TYPE));
    /* tmp1 = B*P' */
    tmp1=malloc(c*n*sizeof(FP_TYPE));
    for (i=0;i<c;i++) {
        for (j=0;j<n;j++) {
            tmp1[i*n+j]=0.0;
            for (k=0;k<n;k++) {
                if (j==k) {
                    tmp1[i*n+j]+=b[i*n+k]/p[k];
                }
            }
        }
    }
    /* tmp2 = tmp1*Bt */
    tmp2=malloc(c*c*sizeof(FP_TYPE));
    for (i=0;i<c;i++) {
        for (j=0;j<c;j++) {
            tmp2[i*c+j]=0.0;
            for (k=0;k<n;k++) {
                tmp2[i*c+j]+=tmp1[i*n+k]*b[j*n+k];
            }
        }
    }
    free(tmp1);
    /* Pa = tmp2' */
    Matrix_inversion(tmp2,Pa,&s,c);
    free(tmp2);
//    print_mat("Pa=(B*P'*Bt)\'",Pa,c,c);

    /* * * Na = At*Pa*A * * */
    Na=malloc(m*m*sizeof(FP_TYPE));
    /* tmp1 = At*Pa */
    tmp1=malloc(m*c*sizeof(FP_TYPE));
    for (i=0;i<m;i++) {
        for (j=0;j<c;j++) {
            tmp1[i*c+j]=0.0;
            for (k=0;k<c;k++) {
                tmp1[i*c+j]+=a[k*m+i]*Pa[k*c+j];
            }
        }
    }
    /* Na = tmp1*A */
    for (i=0;i<m;i++) {
        for (j=0;j<m;j++) {
            Na[i*m+j]=0.0;
            for (k=0;k<c;k++) {
                Na[i*m+j]+=tmp1[i*c+k]*a[k*m+j];
            }
        }
    }
    free(tmp1);
//    print_mat("Na=At*Pa*A",Na,m,m);

    /* * * Na_inv = Na' * * */
    Na_inv=vx;//malloc(m*m*sizeof(FP_TYPE));
    Matrix_inversion(Na,Na_inv,&s,m);
    free(Na);

    /* * * x = Na_inv*At*Pa*w * * */
    /* tmp1 = Na_inv*At */
    tmp1=malloc(m*c*sizeof(FP_TYPE));
    for (i=0;i<m;i++) {
        for (j=0;j<c;j++) {
            tmp1[i*c+j]=0.0;
            for (k=0;k<m;k++) {
                tmp1[i*c+j]+=Na_inv[i*m+k]*a[j*m+k];
            }
        }
    }
    /* tmp2 = tmp1*Pa */
    tmp2=malloc(m*c*sizeof(FP_TYPE));
    for (i=0;i<m;i++) {
        for (j=0;j<c;j++) {
            tmp2[i*c+j]=0.0;
            for (k=0;k<c;k++) {
                tmp2[i*c+j]+=tmp1[i*c+k]*Pa[k*c+j];
            }
        }
    }
    free(tmp1);
    /* x = tmp2*w */
    for (i=0;i<m;i++) {
        x[i]=0.0;
        for (k=0;k<c;k++) {
            x[i]+=tmp2[i*c+k]*w[k];
        }
    }
    free(tmp2);

    /* * * u = P'*Bt*Pa*(w-A*x) * * */
    /* tmp1 = P'*Bt */
    tmp1=malloc(n*c*sizeof(FP_TYPE));
    for (i=0;i<n;i++) {
        for (j=0;j<c;j++) {
            tmp1[i*c+j]=0.0;
            for (k=0;k<n;k++) {
                if (i==k) {
                    tmp1[i*c+j]+=b[j*n+k]/p[i];
                }
            }
        }
    }
    /* tmp2 = tmp1*Pa */
    tmp2=malloc(n*c*sizeof(FP_TYPE));
    for (i=0;i<n;i++) {
        for (j=0;j<c;j++) {
            tmp2[i*c+j]=0.0;
            for (k=0;k<c;k++) {
                tmp2[i*c+j]+=tmp1[i*c+k]*Pa[k*c+j];
            }
        }
    }
    free(tmp1);
    free(Pa);
    /* tmp1 = w-A*x */
    tmp1=malloc(c*sizeof(FP_TYPE));
    for (i=0;i<c;i++) {
        tmp1[i]=w[i];
        for (j=0;j<m;j++) {
            tmp1[i]-=a[i*m+j]*x[j];
        }
    }
    /* u = tmp2*tmp1 */
    for (i=0;i<n;i++) {
        u[i]=0.0;
        for (j=0;j<c;j++) {
            u[i]+=tmp2[i*c+j]*tmp1[j];
        }
    }
    free(tmp1);
    free(tmp2);

    /* * * so^2 = ut*P*u * * */
    /* tmp1 = ut*P */
    tmp1=malloc(n*sizeof(FP_TYPE));
    for (i=0;i<n;i++) {
        tmp1[i]=u[i]*p[i];
        /*
                tmp1[i]=0.0;
                for (j=0;j<n;j++) {
                    if (i==j){
                        tmp1[i]+=u[i]*p[j];
                    }
                }
        */
    }
    /* s = tmp1*u */
    s=0.0;
    for (i=0;i<n;i++) {
        s+=tmp1[i]*u[i];
    }
    Sigma=s/(n-m);
    free(tmp1);

    /* * * Vx=so^2*Na_inv * * */
    for (i=0;i<n;i++) {
        for (j=0;j<m;j++) {
            vx[i*m+j]*=Sigma;
        }
    }

    return mySQRT(Sigma);
}

int Matrix_inversion(FP_TYPE *ata,FP_TYPE *ata_inv,FP_TYPE *det,int n) {
    FP_TYPE *tmpa, *b;
    int *l;
    int r_value;
    register int i, j, k;

    tmpa=malloc(n*n*sizeof(FP_TYPE));
    b=malloc(n*sizeof(FP_TYPE));
    l=malloc(n*sizeof(int));
    r_value=1;
    for (i=0; i<n; i++) {
        for (j=0; j<n; j++) {
            for (k=0; k<n; k++)
                tmpa[j*n+k]=ata[j*n+k];
            b[j]=0.0;
        }
        b[i]=1.0;
        r_value*=Gaussian_elimination(tmpa,b,det,l,n);
//        r_value*=Cholesky_decomposition(tmpa, b, n);
        for (j=0; j<n; j++)
            ata_inv[j*n+i]=b[j];
    }
    return r_value;
}

int Gaussian_elimination(FP_TYPE *a, FP_TYPE *b, FP_TYPE *det, int *l, int n) {
    FP_TYPE s, d;
    int n_1, m;
    register int i, j, k;

    d=1.0;
    n_1=n-1;
    for (k=0; k<n_1; k++) {
        m=k;
        for (i=k+1; i<n; i++)
            if (myABS(a[i*n+k])>myABS(a[m*n+k]))
                m=i;
        l[k]=m;
        d*=a[m*n+k];
        if (myABS(a[m*n+k])<EPSILON)
            return 0;
        if (m!=k) {
            d=-d;
            for (j=k; j<n; j++) {
                s=a[m*n+j];
                a[m*n+j]=a[k*n+j];
                a[k*n+j]=s;
            }
        }
        for (i=k+1; i<n; i++) {
            s=a[i*n+k]/a[k*n+k];
            a[i*n+k]=s;
            for (j=k+1; j<n; j++)
                a[i*n+j]-=s*a[k*n+j];
        }
    }
    d*=a[n_1*n+n_1];
    if (myABS(d)<EPSILON)
        return 0;
    *det=d;
    for (k=0; k<n_1; k++) {
        m=l[k];
        s=b[m];
        b[m]=b[k];
        b[k]=s;
        for (i=k+1; i<n; i++)
            b[i]-=s*a[i*n+k];
    }
    b[n_1]/=a[n_1*n+n_1];
    for (k=n-2; k>=0; k--) {
        s=0.;
        for (j=k+1; j<n; j++)
            s+=a[k*n+j]*b[j];
        b[k]=(b[k]-s)/a[k*n+k];
    }
    return 1;
}

int Cholesky_decomposition(FP_TYPE *a, FP_TYPE *b, int n) {
    FP_TYPE s;
    register int i, j, k;

    for (i=0; i<n; i++) {
        s=0.0;
        if (i)
            for (j=0; j<i; j++)
                s+=mySQR(a[i*n+j]);
        a[i*n+i]=mySQRT(a[i*n+i]-s);
        if (myABS(a[i*n+i])<EPSILON)
            return 0;
        if (i<n-1)
            for (j=i+1; j<n; j++) {
                s=0.0;
                if (i)
                    for (k=0; k<i; k++)
                        s+=a[i*n+k]*a[j*n+k];
                a[j*n+i]=(a[j*n+i]-s)/a[i*n+i];
            }
    }
    b[0]/=a[0*n+0];
    for (k=1; k<n; k++) {
        s=0.0;
        for (j=0; j<k; j++)
            s+=a[k*n+j]*b[j];
        b[k]=(b[k]-s)/a[k*n+k];
    }
    b[n-1]/=a[(n-1)*n+n-1];
    for (i=0; i<n-1; i++) {
        s=0.0;
        for (j=n-i-1; j<n; j++)
            s+=a[j*n+n-i-2]*b[j];
        b[n-i-2]=(b[n-i-2]-s)/a[(n-i-2)*n+n-i-2];
    }
    return 1;
}

void gpl(void) {
    puts("\nCopyright (C) Apr 2010 Ch Iossif <chiossif@yahoo.com>");

    puts("\nThis program is free software: you can redistribute it and/or modify");
    puts("it under the terms of the GNU General Public License as published by");
    puts("the Free Software Foundation, either version 3 of the License, or");
    puts("(at your option) any later version.");

    puts("\nThis program is distributed in the hope that it will be useful,");
    puts("but WITHOUT ANY WARRANTY; without even the implied warranty of");
    puts("MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the");
    puts("GNU General Public License for more details.");

    puts("\nYou should have received a copy of the GNU General Public License");
    puts("along with this program.  If not, see <http://www.gnu.org/licenses/>.\n");
}

FP_TYPE mySQRT(FP_TYPE x) {
    FP_TYPE px,a;

    a=px=x;
    x/=2.0;
    while (myABS(x-px)>EPSILON) {
        px=x;
        x=(px+a/px)/2;
    }
    return x;
}

void print_mat(const char *s, FP_TYPE *a, int m, int n) {
    register i,j;

    printf("\n%s(%dx%d)=\n",s,m,n);
    for (i=0;i<m;i++) {
        for (j=0;j<n;j++)
            printf(FP_FORMAT a[i*n+j]);
        printf("\n");
    }
    return;
}

