/*
 * Decompiled with CFR 0.152.
 */
package eqtlmappingpipeline.normalization;

import JSci.maths.ArrayMath;
import Jama.EigenvalueDecomposition;
import cern.jet.stat.Probability;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import org.apache.commons.math3.stat.ranking.NaNStrategy;
import org.apache.commons.math3.stat.ranking.NaturalRanking;
import org.apache.commons.math3.stat.ranking.TiesStrategy;
import umcg.genetica.console.ProgressBar;
import umcg.genetica.containers.Pair;
import umcg.genetica.io.Gpio;
import umcg.genetica.io.text.TextFile;
import umcg.genetica.math.PCA;
import umcg.genetica.math.matrix.DoubleMatrixDataset;
import umcg.genetica.math.matrix.MatrixHandling;
import umcg.genetica.math.matrix.MatrixTools;
import umcg.genetica.math.stats.Descriptives;
import umcg.genetica.math.stats.Log2Transform;
import umcg.genetica.math.stats.QuantileNormalization;
import umcg.genetica.math.stats.Regression;
import umcg.genetica.math.stats.concurrent.ConcurrentCorrelation;
import umcg.genetica.math.stats.concurrent.ConcurrentCovariation;
import umcg.genetica.methylation.ConvertBetaAndMvalues;

public class Normalizer {
    /**
     * Runs the requested normalization steps on an expression matrix, in a fixed order:
     * optional sample/probe selection, zero-variance probe removal (only when the matrix has
     * more than three columns), quantile normalization, log2 transform, M-value transform,
     * centering/scaling, covariate adjustment, PCA (with optional removal of components) and
     * finally forcing a normal distribution. Each step appends a suffix to the output file
     * name prefix and writes its own result file.
     *
     * @param expressionFile path of the input matrix
     * @param probeIncludeList file with the row names to keep, or {@code null} for all rows
     * @param sampleIncludeList file with the column names to keep, or {@code null} for all columns
     * @param nrPCAsOverSamplesToRemove number of principal components to remove after PCA
     * @param nrIntermediatePCAsOverSamplesToRemoveToOutput interval at which intermediate
     *        PC-corrected matrices are written
     * @param covariatesToRemove covariate file, used only when {@code adjustCovariates} is set
     * @param orthogonalizecovariates passed through to {@link #adjustCovariates}
     * @param outdir output directory; when {@code null} the directory of the input file is used
     * @throws IOException if reading or writing any of the files fails
     */
    public void normalize(String expressionFile, String probeIncludeList, String sampleIncludeList, int nrPCAsOverSamplesToRemove, int nrIntermediatePCAsOverSamplesToRemoveToOutput, String covariatesToRemove, boolean orthogonalizecovariates, String outdir, boolean runQQNorm, boolean runLog2Transform, boolean runMTransform, boolean runCenterScale, boolean runPCA, boolean adjustCovariates, boolean forceMissingValues, boolean forceReplacementOfMissingValues, boolean forceReplacementOfMissingValues2, boolean treatZerosAsNulls, boolean forceNormalDistribution) throws IOException {
        System.out.println("Running normalization.");
        if (outdir != null) {
            outdir = Gpio.formatAsDirectory(outdir);
            Gpio.createDir(outdir);
        } else {
            String inputDir = Gpio.getParentDir(expressionFile);
            outdir = inputDir == null ? "" : inputDir + Gpio.getFileSeparator();
        }
        String expressionFileName = Gpio.getFileName(expressionFile);
        // Literal replacement: with replaceAll the dots would be regex wildcards and could
        // eat unrelated characters (e.g. "mytxt_run.txt" would lose "ytxt").
        expressionFileName = expressionFileName.contains(".txt.gz") ? expressionFileName.replace(".txt.gz", "") : expressionFileName.replace(".txt", "");
        String outputFileNamePrefix = outdir + expressionFileName;

        HashSet<String> s = this.readIncludeList(sampleIncludeList);
        HashSet<String> p = this.readIncludeList(probeIncludeList);

        DoubleMatrixDataset<String, String> dataset;
        if (s != null || p != null) {
            dataset = new DoubleMatrixDataset<String, String>(expressionFile, p, s);
            // Both checks always run so that column and row problems are reported together.
            boolean breakAfterCheck = false;
            if (s != null) {
                outputFileNamePrefix = outputFileNamePrefix + ".SampleSelection";
                breakAfterCheck |= this.reportSelectionMismatch(dataset.colObjects, s, "columns");
            }
            if (p != null) {
                outputFileNamePrefix = outputFileNamePrefix + ".ProbeSelection";
                breakAfterCheck |= this.reportSelectionMismatch(dataset.rowObjects, p, "rows");
            }
            if (breakAfterCheck) {
                System.exit(-1);
            }
            dataset.save(outputFileNamePrefix + ".txt.gz");
        } else {
            dataset = new DoubleMatrixDataset<String, String>(expressionFile);
        }

        if (dataset.nrCols > 3) {
            outputFileNamePrefix = this.removeProbesWithZeroVariance(dataset, outputFileNamePrefix);
        }
        if (runQQNorm) {
            outputFileNamePrefix = this.quantileNormalize(dataset, outputFileNamePrefix, forceMissingValues, forceReplacementOfMissingValues, forceReplacementOfMissingValues2, treatZerosAsNulls);
        }
        if (runLog2Transform) {
            outputFileNamePrefix = this.log2transform(dataset, outputFileNamePrefix);
        }
        if (runMTransform) {
            outputFileNamePrefix = this.mValueTransform(dataset, outputFileNamePrefix);
        }
        if (runCenterScale) {
            outputFileNamePrefix = this.centerAndScale(dataset, outputFileNamePrefix);
        }
        if (adjustCovariates && covariatesToRemove != null) {
            outputFileNamePrefix = this.adjustCovariates(dataset, outputFileNamePrefix, covariatesToRemove, orthogonalizecovariates, 1.0E-10);
        }
        if (runPCA) {
            ConcurrentCorrelation c = new ConcurrentCorrelation(2);
            double[][] correlationMatrix = c.pairwiseCorrelation(dataset.getRawDataTransposed());
            Pair<DoubleMatrixDataset<String, String>, DoubleMatrixDataset<String, String>> PCAResults = this.calculatePCA(dataset, correlationMatrix, outputFileNamePrefix, null);
            if (nrPCAsOverSamplesToRemove != 0 || nrIntermediatePCAsOverSamplesToRemoveToOutput != 0) {
                this.correctDataForPCs(dataset, outputFileNamePrefix, nrPCAsOverSamplesToRemove, nrIntermediatePCAsOverSamplesToRemoveToOutput, PCAResults.getLeft(), PCAResults.getRight());
            }
        }
        if (forceNormalDistribution) {
            this.forceNormalDistribution(dataset, outputFileNamePrefix);
        }
    }

    /**
     * Reads every line of an include list into a set and closes the reader afterwards.
     *
     * @param listFile path of the list, or {@code null} when no list was supplied
     * @return the lines of the file, or {@code null} when {@code listFile} is {@code null}
     */
    private HashSet<String> readIncludeList(String listFile) throws IOException {
        if (listFile == null) {
            return null;
        }
        TextFile reader = new TextFile(listFile, false);
        try {
            return new HashSet<String>(reader.readAsArrayList());
        } finally {
            reader.close();
        }
    }

    /**
     * Compares the names present in the loaded matrix with the requested names and reports
     * any difference on stderr. Missing names take precedence over unexpected ones.
     *
     * @param present names found in the matrix
     * @param requested names from the include list
     * @param axisLabel word used in the message ("columns" or "rows")
     * @return {@code true} when a mismatch was reported
     */
    private boolean reportSelectionMismatch(java.util.Collection<String> present, HashSet<String> requested, String axisLabel) {
        HashSet<String> allNames = new HashSet<String>();
        allNames.addAll(present);
        allNames.addAll(requested);
        // Hash lookup instead of scanning the name list for every candidate.
        HashSet<String> presentLookup = new HashSet<String>(present);
        HashSet<String> missingNames = new HashSet<String>();
        HashSet<String> extraNames = new HashSet<String>();
        for (String name : allNames) {
            if (!requested.contains(name)) {
                extraNames.add(name);
            }
            if (!presentLookup.contains(name)) {
                missingNames.add(name);
            }
        }
        if (!missingNames.isEmpty()) {
            System.err.println("\nMatrix does not contains desired " + axisLabel + ", please check filtering list.");
            System.err.println(missingNames.toString() + "\n");
            return true;
        }
        if (!extraNames.isEmpty()) {
            System.err.println("\nMatrix contains unwanted " + axisLabel + ", please check filtering list.");
            System.err.println(extraNames.toString() + "\n");
            return true;
        }
        return false;
    }

    /**
     * Rank-transforms every row of the matrix to normal quantiles, in place, and saves the
     * result as {@code <prefix>.ForcedNormal.txt.gz}.
     * <p>
     * Each row is ranked (ties receive the average rank; the ranking is configured with
     * {@code NaNStrategy.FAILED}), the rank {@code r} is mapped to {@code (r - 0.5) / n} and
     * that probability is passed through the inverse normal distribution function.
     *
     * @param dataset matrix whose raw data is overwritten
     * @param fileNamePrefix output prefix so far
     * @return the prefix extended with {@code .ForcedNormal}
     * @throws IOException if the result cannot be written
     */
    public String forceNormalDistribution(DoubleMatrixDataset<String, String> dataset, String fileNamePrefix) throws IOException {
        final double[][] values = dataset.getRawData();
        final NaturalRanking ranker = new NaturalRanking(NaNStrategy.FAILED, TiesStrategy.AVERAGE);
        final int rowCount = dataset.rowObjects.size();
        final int colCount = dataset.colObjects.size();

        for (int row = 0; row < rowCount; ++row) {
            final double[] current = values[row];
            final double[] ranks = ranker.rank(current);
            final double rankCount = (double) ranks.length;
            for (int col = 0; col < colCount; ++col) {
                final double probability = (0.5 + ranks[col] - 1.0) / rankCount;
                current[col] = Probability.normalInverse(probability);
            }
        }

        final String outputPrefix = fileNamePrefix + ".ForcedNormal";
        DoubleMatrixDataset<String, String> forced = new DoubleMatrixDataset<String, String>(values, dataset.rowObjects, dataset.colObjects);
        forced.save(outputPrefix + ".txt.gz");
        return outputPrefix;
    }

    /**
     * Quantile-normalizes the matrix and saves it as {@code <prefix>.QuantileNormalized.txt.gz}.
     * <p>
     * When the data holds no missing values the plain quantile normalization is used.
     * Otherwise the first applicable flag decides the missing-value variant, in this order:
     * {@code forceReplacementOfMissingValues}, {@code forceReplacementOfMissingValues2},
     * {@code forceMissingValues}. Without any of these flags a warning is printed and the JVM
     * exits with status 0.
     *
     * @param treatZerosAsNulls when set, zeros are converted to missing values before the
     *        normalization and missing values are converted back to zero afterwards
     * @return the prefix extended with {@code .QuantileNormalized}
     * @throws IOException if the result cannot be written
     */
    public String quantileNormalize(DoubleMatrixDataset<String, String> dataset, String fileNamePrefix, boolean forceMissingValues, boolean forceReplacementOfMissingValues, boolean forceReplacementOfMissingValues2, boolean treatZerosAsNulls) throws IOException {
        final double[][] matrix = dataset.getRawData();
        boolean hasMissingValues = MatrixTools.containsNaNs(matrix);

        if (treatZerosAsNulls) {
            if (hasMissingValues) {
                System.out.println("Warning: Data already contains nulls before treating zeros as nulls.\n Later on it will not be possible to distinguish between those two!");
            }
            MatrixHandling.ReplaceZerosToNull(matrix);
            hasMissingValues = MatrixTools.containsNaNs(matrix);
        }

        if (hasMissingValues) {
            if (forceReplacementOfMissingValues || forceReplacementOfMissingValues2) {
                // The first replacement flag wins; the second variant only applies on its own.
                final boolean secondVariant = !forceReplacementOfMissingValues;
                QuantileNormalization.QuantileNormAdressingNaValuesAfterInitialQN(dataset, false, secondVariant, false);
            } else if (forceMissingValues) {
                QuantileNormalization.QuantileNormAdressingNaValuesAfterInitialQN(dataset, true, false, treatZerosAsNulls);
            } else {
                System.out.println("Warning: Your data contains missing values and missing value treatment is not selected.\nIf desired please supply additional flag: --forceMissingValues or --forceReplacementOfMissingValues");
                System.exit(0);
            }
        } else {
            QuantileNormalization.quantilenormalize(matrix);
        }

        if (treatZerosAsNulls) {
            MatrixHandling.ReplaceNullToZero(matrix);
        }

        final String outputPrefix = fileNamePrefix + ".QuantileNormalized";
        DoubleMatrixDataset<String, String> normalized = new DoubleMatrixDataset<String, String>(matrix, dataset.rowObjects, dataset.colObjects);
        normalized.save(outputPrefix + ".txt.gz");
        return outputPrefix;
    }

    /**
     * Hands the raw matrix to {@code Log2Transform.log2transform} and saves the same array as
     * {@code <prefix>.Log2Transformed.txt.gz}.
     *
     * @param dataset matrix to transform
     * @param fileNamePrefix output prefix so far
     * @return the prefix extended with {@code .Log2Transformed}
     * @throws IOException if the result cannot be written
     */
    public String log2transform(DoubleMatrixDataset<String, String> dataset, String fileNamePrefix) throws IOException {
        final double[][] matrix = dataset.getRawData();
        Log2Transform.log2transform(matrix);

        final String outputPrefix = fileNamePrefix + ".Log2Transformed";
        DoubleMatrixDataset<String, String> transformed = new DoubleMatrixDataset<String, String>(matrix, dataset.rowObjects, dataset.colObjects);
        transformed.save(outputPrefix + ".txt.gz");
        return outputPrefix;
    }

    /**
     * Hands the raw matrix to {@code ConvertBetaAndMvalues.transformToMvalue} and saves the
     * same array as {@code <prefix>.MvalueTransformed.txt.gz}.
     *
     * @param dataset matrix to transform
     * @param fileNamePrefix output prefix so far
     * @return the prefix extended with {@code .MvalueTransformed}
     * @throws IOException if the result cannot be written
     */
    public String mValueTransform(DoubleMatrixDataset<String, String> dataset, String fileNamePrefix) throws IOException {
        final double[][] matrix = dataset.getRawData();
        ConvertBetaAndMvalues.transformToMvalue(matrix);

        final String outputPrefix = fileNamePrefix + ".MvalueTransformed";
        DoubleMatrixDataset<String, String> transformed = new DoubleMatrixDataset<String, String>(matrix, dataset.rowObjects, dataset.colObjects);
        transformed.save(outputPrefix + ".txt.gz");
        return outputPrefix;
    }

    /**
     * Centers every probe (row) on its mean, saves that intermediate matrix as
     * {@code <prefix>.ProbesCentered.txt.gz}, then z-transforms every sample (column) and
     * saves the result as {@code <prefix>.ProbesCentered.SamplesZTransformed.txt.gz}.
     * <p>
     * A column without any variation has a standard deviation of zero, so the division
     * produces non-finite values for that column.
     *
     * @param dataset matrix that is modified in place
     * @param fileNamePrefix output prefix so far
     * @return the prefix extended with {@code .ProbesCentered.SamplesZTransformed}
     * @throws IOException if one of the result files cannot be written
     */
    public String centerAndScale(DoubleMatrixDataset<String, String> dataset, String fileNamePrefix) throws IOException {
        double[][] rawData = dataset.getRawData();
        final int probeCount = dataset.rowObjects.size();
        final int sampleCount = dataset.colObjects.size();

        System.out.println("Standardizing probe mean");
        for (int p = 0; p < probeCount; ++p) {
            double[] row = rawData[p];
            double mean = Descriptives.mean(row);
            for (int s = 0; s < sampleCount; ++s) {
                row[s] = row[s] - mean;
            }
        }
        dataset.setRawData(rawData);
        fileNamePrefix = fileNamePrefix + ".ProbesCentered";
        dataset.save(fileNamePrefix + ".txt.gz");

        System.out.println("- Standardizing sample mean and standard deviation");
        double[][] current = dataset.getRawData();
        for (int s = 0; s < sampleCount; ++s) {
            double[] vals = new double[probeCount];
            for (int p = 0; p < probeCount; ++p) {
                vals[p] = current[p][s];
            }
            double mean = Descriptives.mean(vals);
            // Take the variance while the values are still uncentered, about their own mean.
            // Centering first and then passing the pre-centering mean would subtract the mean
            // twice and inflate the standard deviation.
            double stdev = Math.sqrt(Descriptives.variance(vals, mean));
            for (int p = 0; p < probeCount; ++p) {
                current[p][s] = (vals[p] - mean) / stdev;
            }
        }

        DoubleMatrixDataset<String, String> datasetNormalized = new DoubleMatrixDataset<String, String>(rawData, dataset.rowObjects, dataset.colObjects);
        fileNamePrefix = fileNamePrefix + ".SamplesZTransformed";
        datasetNormalized.save(fileNamePrefix + ".txt.gz");
        return fileNamePrefix;
    }

    /**
     * Regresses covariates out of the trait matrix and saves the result as
     * {@code <prefix>.CovariatesRemoved.txt.gz}.
     * <p>
     * Steps: load the covariates (which also replaces the contents of {@code traitData} with
     * the matrix returned by the loader), z-transform every covariate row, run a PCA on the
     * covariance matrix of the covariates, and regress each resulting component out of every
     * trait row. A component is skipped when the value read for it from
     * {@code <covariatesToRemove>.PCAOverSamplesEigenvalues.txt.gz} does not exceed
     * {@code varianceExplainedCutoff}.
     *
     * @param traitData trait matrix; its raw data and row/column names are replaced
     * @param fileNamePrefix output prefix so far
     * @param covariatesToRemove covariate file; also used as prefix for the PCA output files
     * @param orthogonalizecovariates not read by this method
     * @param varianceExplainedCutoff threshold a component has to exceed to be regressed out
     * @return the prefix extended with {@code .CovariatesRemoved}
     * @throws IOException if reading or writing any of the files fails
     */
    public String adjustCovariates(DoubleMatrixDataset<String, String> traitData, String fileNamePrefix, String covariatesToRemove, boolean orthogonalizecovariates, double varianceExplainedCutoff) throws IOException {
        Pair<DoubleMatrixDataset<String, String>, DoubleMatrixDataset<String, String>> covariateData = this.loadCovariateValues(covariatesToRemove, traitData);
        DoubleMatrixDataset<String, String> covariateDataset = covariateData.getLeft();
        DoubleMatrixDataset<String, String> traitDataUpdated = covariateData.getRight();
        traitData.rawData = traitDataUpdated.rawData;
        traitData.colObjects = traitDataUpdated.colObjects;
        traitData.rowObjects = traitDataUpdated.rowObjects;
        traitData.recalculateHashMaps();
        System.out.println("Covariate data has " + covariateDataset.nrRows + " rows and " + covariateDataset.nrCols + " columns.");

        // Z-transform every covariate. A constant covariate has stdev 0 and becomes non-finite.
        double[][] covariateRows = covariateDataset.getRawData();
        for (int p = 0; p < covariateDataset.rowObjects.size(); ++p) {
            double[] row = covariateRows[p];
            double mean = Descriptives.mean(row);
            double stdev = Math.sqrt(Descriptives.variance(row, mean));
            for (int s = 0; s < covariateDataset.colObjects.size(); ++s) {
                row[s] = (row[s] - mean) / stdev;
            }
        }

        ConcurrentCovariation c = new ConcurrentCovariation(2);
        double[][] correlationMatrix = c.pairwiseCovariation(covariateDataset.getRawData());
        // calculatePCA works on the columns of its input, so the covariates are transposed
        // going in and the resulting component scores are transposed back coming out.
        covariateDataset.transposeDataset();
        Pair<DoubleMatrixDataset<String, String>, DoubleMatrixDataset<String, String>> PCAResults = this.calculatePCA(covariateDataset, correlationMatrix, covariatesToRemove, null);
        covariateDataset = PCAResults.getLeft();
        covariateDataset.transposeDataset();
        double[][] covariateValues = covariateDataset.getRawData();
        System.out.println(covariateDataset.nrRows + " covariates finally loaded.");

        double[] pcaExpVar = new double[covariateValues.length];
        System.out.println("Loading eigenvalues from: " + covariatesToRemove + ".PCAOverSamplesEigenvalues.txt.gz");
        TextFile tf = new TextFile(covariatesToRemove + ".PCAOverSamplesEigenvalues.txt.gz", false);
        try {
            tf.readLine(); // skip the header line
            String[] elems = tf.readLineElems(TextFile.tab);
            while (elems != null) {
                if (elems.length > 2) {
                    int pcanr = Integer.parseInt(elems[0]);
                    // NOTE(review): calculatePCA writes the data rows as
                    // <nr> <eigenvalue> <explained variance> <cumulative>, so this second
                    // field is the eigenvalue rather than the explained variance - confirm
                    // which of the two the cutoff is meant for.
                    double expvar = Double.parseDouble(elems[1]);
                    pcaExpVar[pcanr - 1] = expvar;
                    System.out.println(pcanr + "\t" + expvar);
                }
                elems = tf.readLineElems(TextFile.tab);
            }
        } finally {
            // Release the reader even when a line cannot be parsed.
            tf.close();
        }

        double[][] rawdata = traitData.getRawData();
        for (int i = 0; i < covariateValues.length; ++i) {
            if (pcaExpVar[i] > varianceExplainedCutoff) {
                this.correctForCovariate(rawdata, covariateValues, i);
            } else {
                System.out.println("Not regressing covariate: " + i + " because explained variance < " + varianceExplainedCutoff + ": " + pcaExpVar[i]);
            }
        }

        DoubleMatrixDataset<String, String> datasetNormalized = new DoubleMatrixDataset<String, String>(rawdata, traitData.rowObjects, traitData.colObjects);
        fileNamePrefix = fileNamePrefix + ".CovariatesRemoved";
        datasetNormalized.save(fileNamePrefix + ".txt.gz");
        traitData.rawData = rawdata;
        return fileNamePrefix;
    }

    /**
     * Builds the symmetric sample-by-sample matrix of cross products: for every pair of
     * columns the products of their values are summed over all rows and divided by
     * (number of rows - 1). The values are used as they are; no centering is applied here.
     *
     * @param dataset matrix with probes as rows and samples as columns
     * @return a square matrix with one row and column per sample
     */
    private double[][] correlateSamples(DoubleMatrixDataset<String, String> dataset) {
        final int sampleCount = dataset.colObjects.size();
        final int probeCount = dataset.rowObjects.size();
        final double divisor = probeCount - 1;
        final double[][] result = new double[sampleCount][sampleCount];
        final double[][] data = dataset.getRawData();

        ProgressBar progress = new ProgressBar(sampleCount, "- Calculating correlations: " + sampleCount + " x " + sampleCount);
        for (int first = 0; first < sampleCount; ++first) {
            // Only the upper triangle is computed; the value is mirrored below the diagonal.
            for (int second = first; second < sampleCount; ++second) {
                double sumOfProducts = 0.0;
                for (int probe = 0; probe < probeCount; ++probe) {
                    final double[] row = data[probe];
                    sumOfProducts += row[first] * row[second];
                }
                final double value = sumOfProducts / divisor;
                result[first][second] = value;
                result[second][first] = value;
            }
            progress.iterate();
        }
        progress.close();
        return result;
    }

    /**
     * Builds the symmetric probe-by-probe matrix of cross products: for every pair of rows
     * the products of their values are summed over all samples and divided by
     * (number of samples - 1). The values are used as they are; no centering is applied here.
     * Every computed pair is also printed to stdout.
     *
     * @param dataset matrix with probes as rows and samples as columns
     * @return a square matrix with one row and column per probe
     */
    public double[][] correlateProbes(DoubleMatrixDataset<String, String> dataset) {
        final int probeCount = dataset.rowObjects.size();
        final int sampleCount = dataset.colObjects.size();
        double[][] correlationMatrix = new double[probeCount][probeCount];
        // The sum runs over samples, so the divisor is based on the sample count.
        double sampleCountMinusOne = sampleCount - 1;
        double[][] data = dataset.getRawData();

        ProgressBar pb = new ProgressBar(probeCount, "- Calculating correlations: " + probeCount + " x " + probeCount);
        for (int f = 0; f < probeCount; ++f) {
            double[] rowF = data[f];
            for (int g = f; g < probeCount; ++g) {
                double[] rowG = data[g];
                double covarianceInterim = 0.0;
                // f and g are probe (row) indices; using them as column indices would read
                // the wrong cells and overrun the row whenever there are more probes than
                // samples.
                for (int s = 0; s < sampleCount; ++s) {
                    covarianceInterim += rowF[s] * rowG[s];
                }
                double covariance = covarianceInterim / sampleCountMinusOne;
                correlationMatrix[f][g] = covariance;
                correlationMatrix[g][f] = covariance;
                System.out.println(f + "\t" + g + "\t" + covariance);
            }
            pb.iterate();
        }
        pb.close();
        return correlationMatrix;
    }

    /**
     * Performs an eigenvalue decomposition of {@code correlationMatrix} and projects the
     * rows of {@code dataset} onto the resulting eigenvectors.
     * <p>
     * Files written, all prefixed with {@code fileNamePrefix}:
     * {@code .PCAOverSamplesEigenvalues.txt.gz}, {@code .PCAOverSamplesEigenvectors.txt.gz},
     * {@code .PCAOverSamplesEigenvectorsTransposed.txt.gz} and
     * {@code .PCAOverSamplesPrincipalComponents.txt.gz}.
     *
     * @param dataset matrix whose columns correspond to the rows/columns of the matrix below
     * @param correlationMatrix square matrix to decompose
     * @param fileNamePrefix prefix for the output files
     * @param nrOfPCsToCalculate number of components; {@code null} or a value above the
     *        number of columns means one component per column
     * @return pair of (component scores: rows x components, eigenvectors: columns x components)
     * @throws IllegalArgumentException if {@code nrOfPCsToCalculate} is below 1
     * @throws IOException if one of the output files cannot be written
     */
    public Pair<DoubleMatrixDataset<String, String>, DoubleMatrixDataset<String, String>> calculatePCA(DoubleMatrixDataset<String, String> dataset, double[][] correlationMatrix, String fileNamePrefix, Integer nrOfPCsToCalculate) throws IOException {
        String expressionFile = fileNamePrefix;
        System.out.println("Calculating PCA over file: " + fileNamePrefix);
        System.out.println("- Performing PCA over correlation matrix of size: " + correlationMatrix.length + "x" + correlationMatrix.length);
        EigenvalueDecomposition eig = PCA.eigenValueDecomposition(correlationMatrix);

        final int colCount = dataset.colObjects.size();
        final int rowCount = dataset.rowObjects.size();
        // Resolve the boxed argument once instead of unboxing it in every loop test.
        final int pcCount;
        if (nrOfPCsToCalculate == null || nrOfPCsToCalculate.intValue() > colCount) {
            pcCount = colCount;
        } else if (nrOfPCsToCalculate.intValue() < 1) {
            throw new IllegalArgumentException("Number of PCs to calculate should be at least 1");
        } else {
            pcCount = nrOfPCsToCalculate.intValue();
        }

        DoubleMatrixDataset<String, String> datasetEV = new DoubleMatrixDataset<String, String>(colCount, pcCount);
        datasetEV.rowObjects = dataset.colObjects;
        double[] eigenValues = eig.getRealEigenvalues();
        System.out.println("Eigenvalue results:");
        System.out.println("PCA\tPCANr\tEigenValue\tExplainedVariance\tTotalExplainedVariance");
        TextFile out = new TextFile(expressionFile + ".PCAOverSamplesEigenvalues.txt.gz", true);
        try {
            double cumExpVarPCA = 0.0;
            // NOTE(review): the header names five columns while each data row below has four
            // fields (no leading "PCA" label). Left unchanged because readers of this file
            // depend on the current layout.
            out.writeln("PCA\tPCANr\tEigenValue\tExplainedVariance\tTotalExplainedVariance");
            double[][] eigenvectorData = datasetEV.getRawData();
            for (int pca = 0; pca < pcCount; ++pca) {
                double expVarPCA = PCA.getEigenValueVar(eigenValues, pca);
                double[] eigenVector = PCA.getEigenVector(eig, eigenValues, pca);
                for (int s = 0; s < colCount; ++s) {
                    eigenvectorData[s][pca] = eigenVector[s];
                }
                int pcaNr = pca + 1;
                cumExpVarPCA += expVarPCA;
                // Components are numbered from the end of the eigenvalue array.
                double eigenValue = eigenValues[eigenValues.length - 1 - pca];
                out.write(pcaNr + "\t" + eigenValue + "\t" + expVarPCA + "\t" + cumExpVarPCA + "\n");
                datasetEV.colObjects.set(pca, "Comp" + String.valueOf(pcaNr));
                System.out.println("PCA:\t" + pcaNr + "\t" + eigenValue + "\t" + expVarPCA + "\t" + cumExpVarPCA);
            }
        } finally {
            // Release the writer even when the decomposition helpers fail half-way.
            out.close();
        }

        datasetEV.save(expressionFile + ".PCAOverSamplesEigenvectors.txt.gz");
        datasetEV.transposeDataset();
        datasetEV.save(expressionFile + ".PCAOverSamplesEigenvectorsTransposed.txt.gz");
        datasetEV.transposeDataset();

        System.out.println("Calculating PCs");
        System.out.println("Initializing PCA matrix");
        DoubleMatrixDataset<String, String> datasetPCAOverSamplesPCAs = new DoubleMatrixDataset<String, String>(rowCount, pcCount);
        datasetPCAOverSamplesPCAs.rowObjects = dataset.rowObjects;
        for (int comp = 0; comp < pcCount; ++comp) {
            datasetPCAOverSamplesPCAs.colObjects.set(comp, "Comp" + String.valueOf(comp + 1));
        }
        double[][] scores = datasetPCAOverSamplesPCAs.getRawData();
        for (int p = 0; p < rowCount; ++p) {
            for (int comp = 0; comp < pcCount; ++comp) {
                scores[p][comp] = 0.0;
            }
        }

        // Fetched after the two transposes above so the current arrays are used.
        double[][] eigenvectors = datasetEV.getRawData();
        double[][] data = dataset.getRawData();
        ProgressBar pb = new ProgressBar(rowCount, "Calculating the PCA scores per probe: ");
        for (int probe = 0; probe < rowCount; ++probe) {
            double[] probeScores = scores[probe];
            double[] probeValues = data[probe];
            for (int comp = 0; comp < pcCount; ++comp) {
                for (int sample = 0; sample < colCount; ++sample) {
                    probeScores[comp] = probeScores[comp] + eigenvectors[sample][comp] * probeValues[sample];
                }
            }
            pb.iterate();
        }
        pb.close();

        String outfilename = expressionFile + ".PCAOverSamplesPrincipalComponents.txt.gz";
        System.out.println("Saving PCA scores: " + outfilename);
        datasetPCAOverSamplesPCAs.save(outfilename);
        return new Pair<DoubleMatrixDataset<String, String>, DoubleMatrixDataset<String, String>>(datasetPCAOverSamplesPCAs, datasetEV);
    }

    /**
     * Subtracts the first {@code nrPCAsOverSamplesToRemove} principal components from the
     * matrix, in place. After every {@code nrIntermediatePCAsOverSamplesToRemoveToOutput}
     * components an intermediate matrix is saved, and the final matrix is always saved as
     * {@code <prefix>.<n>PCAsOverSamplesRemoved.txt.gz}.
     * <p>
     * When more components are requested than there are samples, the number is lowered to
     * the sample count, rounded down to a multiple of the output interval when an interval
     * is set.
     *
     * @param dataset matrix to correct (probes x samples)
     * @param fileNamePrefix prefix for the output files
     * @param nrPCAsOverSamplesToRemove number of components to remove
     * @param nrIntermediatePCAsOverSamplesToRemoveToOutput output interval; values of zero or
     *        below disable the intermediate files
     * @param datasetPCAOverSamplesPCAs component scores (probes x components)
     * @param datasetEV eigenvectors (samples x components)
     * @throws IOException if a result file cannot be written
     */
    public void correctDataForPCs(DoubleMatrixDataset<String, String> dataset, String fileNamePrefix, int nrPCAsOverSamplesToRemove, int nrIntermediatePCAsOverSamplesToRemoveToOutput, DoubleMatrixDataset<String, String> datasetPCAOverSamplesPCAs, DoubleMatrixDataset<String, String> datasetEV) throws IOException {
        String expressionFile = fileNamePrefix;
        System.out.println("\nInitializing residual gene expression matrix");
        final int sampleCount = dataset.colObjects.size();
        final int probeCount = dataset.rowObjects.size();
        if (sampleCount < nrPCAsOverSamplesToRemove) {
            // Without an output interval there is nothing to round to; taking the remainder
            // with an interval of 0 would throw an ArithmeticException.
            int remainder = nrIntermediatePCAsOverSamplesToRemoveToOutput > 0 ? sampleCount % nrIntermediatePCAsOverSamplesToRemoveToOutput : 0;
            nrPCAsOverSamplesToRemove = sampleCount - remainder;
        }

        double[][] data = dataset.getRawData();
        double[][] scores = datasetPCAOverSamplesPCAs.getRawData();
        double[][] eigenvectors = datasetEV.getRawData();
        for (int t = 0; t < nrPCAsOverSamplesToRemove; ++t) {
            for (int p = 0; p < probeCount; ++p) {
                double[] row = data[p];
                double score = scores[p][t];
                for (int s = 0; s < sampleCount; ++s) {
                    row[s] = row[s] - score * eigenvectors[s][t];
                }
            }
            int nrPCAs = t + 1;
            if (nrIntermediatePCAsOverSamplesToRemoveToOutput > 0 && nrPCAs % nrIntermediatePCAsOverSamplesToRemoveToOutput == 0) {
                dataset.save(expressionFile + "." + nrPCAs + "PCAsOverSamplesRemoved.txt.gz");
                System.out.println("Removed\t" + nrPCAs + "\tPCs. File:\t" + expressionFile + "." + nrPCAs + "PCAsOverSamplesRemoved.txt.gz");
            }
        }
        dataset.save(expressionFile + "." + nrPCAsOverSamplesToRemove + "PCAsOverSamplesRemoved.txt.gz");
    }

    /**
     * Re-applies the principal component correction from previously written PCA files while
     * leaving the components listed in {@code pcasNotToRemove} untouched.
     * <p>
     * The eigenvector and principal component files are looked up by name in
     * {@code parentDir}; when either is missing a message is printed and the JVM exits with
     * status 0. The expression matrix is loaded from {@code parentDir} using the name of
     * {@code expressionFile} with any {@code ...PCAsOverSamplesRemoved} part stripped.
     * Results are written as {@code <name>.<n>PCAsOverSamplesRemoved-GeneticVectorsNotRemoved.txt.gz}.
     *
     * @param pcasNotToRemove 1-based numbers of the components to keep
     * @param parentDir directory holding the PCA files; also the output directory
     * @param expressionFile expression file the PCA was calculated on
     * @param nrPCAsOverSamplesToRemove number of components to walk through
     * @param nrIntermediatePCAsOverSamplesToRemoveToOutput output interval; values of zero or
     *        below disable the intermediate files
     * @throws IOException if reading or writing any of the files fails
     */
    public void repeatPCAOmitCertainPCAs(HashSet<Integer> pcasNotToRemove, String parentDir, String expressionFile, int nrPCAsOverSamplesToRemove, int nrIntermediatePCAsOverSamplesToRemoveToOutput) throws IOException {
        System.out.println("Will write output to: " + parentDir);
        String[] files = Gpio.getListOfFiles(parentDir);
        if (files == null) {
            // Treat an unreadable directory like an empty one so the messages below are shown.
            files = new String[0];
        }
        String startExpressionFileName = expressionFile;
        File st = new File(startExpressionFileName);
        parentDir = parentDir + Gpio.getFileSeparator();
        String minimalFilename = st.getName();
        String[] expressionFileNameElems = minimalFilename.split("\\.");
        String eigenvectorFile = null;
        String principalComponentsFile = null;
        if (minimalFilename.contains("PCAsOverSamplesRemoved")) {
            // Drop the "<n>PCAsOverSamplesRemoved" element to get back to the uncorrected file.
            StringBuilder newMinimal = new StringBuilder();
            newMinimal.append(expressionFileNameElems[0]);
            for (int i = 1; i < expressionFileNameElems.length; ++i) {
                if (expressionFileNameElems[i].contains("PCAsOverSamplesRemoved")) {
                    continue;
                }
                newMinimal.append(".").append(expressionFileNameElems[i]);
            }
            minimalFilename = newMinimal.toString();
        }
        for (String file : files) {
            String lowerCaseName = file.toLowerCase();
            // The trailing dot keeps the "...EigenvectorsTransposed" file from matching.
            if (lowerCaseName.contains("pcaoversampleseigenvectors.")) {
                eigenvectorFile = parentDir + "" + file;
            } else if (lowerCaseName.contains("pcaoversamplesprincipalcomponents")) {
                principalComponentsFile = parentDir + "" + file;
            }
        }
        boolean fileFound = true;
        if (eigenvectorFile == null) {
            System.err.println("Could not find file containing 'PCAOverSamplesEigenvectors' in directory: " + parentDir);
            fileFound = false;
        }
        // Check the principal component file itself, not the eigenvector file a second time.
        if (principalComponentsFile == null) {
            System.err.println("Could not find file containing 'PCAOverSamplesPrincipalComponents' in directory: " + parentDir);
            fileFound = false;
        }
        if (!fileFound) {
            System.exit(0);
        }
        System.out.println("Detected core file name to be: " + minimalFilename);
        DoubleMatrixDataset<String, String> expressionDataset = new DoubleMatrixDataset<String, String>(parentDir + minimalFilename);
        DoubleMatrixDataset<String, String> datasetPCAOverSamplesPCAs = new DoubleMatrixDataset<String, String>(principalComponentsFile);
        DoubleMatrixDataset<String, String> datasetEV = new DoubleMatrixDataset<String, String>(eigenvectorFile);
        final int sampleCount = expressionDataset.colObjects.size();
        final int probeCount = expressionDataset.rowObjects.size();
        if (sampleCount < nrPCAsOverSamplesToRemove) {
            // Without an output interval there is nothing to round to; taking the remainder
            // with an interval of 0 would throw an ArithmeticException.
            int remainder = nrIntermediatePCAsOverSamplesToRemoveToOutput > 0 ? sampleCount % nrIntermediatePCAsOverSamplesToRemoveToOutput : 0;
            nrPCAsOverSamplesToRemove = sampleCount - remainder;
        }
        if (minimalFilename.endsWith(".txt")) {
            minimalFilename = minimalFilename.substring(0, minimalFilename.length() - 4);
        } else if (minimalFilename.endsWith(".txt.gz")) {
            minimalFilename = minimalFilename.substring(0, minimalFilename.length() - 7);
        }
        double[][] data = expressionDataset.getRawData();
        double[][] scores = datasetPCAOverSamplesPCAs.getRawData();
        double[][] eigenvectors = datasetEV.getRawData();
        for (int t = 0; t < nrPCAsOverSamplesToRemove; ++t) {
            int nrPCAs = t + 1;
            if (!pcasNotToRemove.contains(nrPCAs)) {
                for (int p = 0; p < probeCount; ++p) {
                    double[] row = data[p];
                    double score = scores[p][t];
                    for (int s = 0; s < sampleCount; ++s) {
                        row[s] = row[s] - score * eigenvectors[s][t];
                    }
                }
            } else {
                System.out.println("Omitting PCA: " + nrPCAs + " since this component is under genetic control");
            }
            if (nrIntermediatePCAsOverSamplesToRemoveToOutput > 0 && nrPCAs % nrIntermediatePCAsOverSamplesToRemoveToOutput == 0) {
                expressionDataset.save(parentDir + minimalFilename + "." + nrPCAs + "PCAsOverSamplesRemoved-GeneticVectorsNotRemoved.txt.gz");
                System.out.println("Removed\t" + nrPCAs + "\tPCs. File:\t" + minimalFilename + "." + nrPCAs + "PCAsOverSamplesRemoved-GeneticVectorsNotRemoved.txt.gz");
            }
        }
        expressionDataset.save(parentDir + minimalFilename + "." + nrPCAsOverSamplesToRemove + "PCAsOverSamplesRemoved-GeneticVectorsNotRemoved.txt.gz");
        System.out.println("Done\n");
    }

    /**
     * Regresses one covariate out of every row of {@code rawdata}, in place.
     * <p>
     * For each row the residuals {@code y - x * rc[0]} are computed from the regression
     * coefficients, centered, divided by the ratio between the residual and the original
     * standard deviation, and shifted back to the original row mean. A row whose residuals
     * have no variation gets a ratio of zero, which makes the division non-finite.
     *
     * @param rawdata trait matrix (rows x samples) that is overwritten
     * @param covariateValues covariate matrix (covariates x samples)
     * @param covariateToCorrect index of the covariate row to regress out
     */
    private void correctForCovariate(double[][] rawdata, double[][] covariateValues, int covariateToCorrect) {
        // The covariate is the same for every probe, so look it up once.
        final double[] x = covariateValues[covariateToCorrect];
        for (int probe = 0; probe < rawdata.length; ++probe) {
            double[] y = rawdata[probe];
            double meanY = ArrayMath.mean(y);
            double varianceY = ArrayMath.variance(y);
            double[] rc = Regression.getLinearRegressionCoefficients(x, y);

            double[] rawDataUpdated = new double[x.length];
            for (int s = 0; s < x.length; ++s) {
                rawDataUpdated[s] = y[s] - x[s] * rc[0];
            }

            double meanUpdated = ArrayMath.mean(rawDataUpdated);
            double stdDevRatio = ArrayMath.standardDeviation(rawDataUpdated) / Math.sqrt(varianceY);
            for (int s = 0; s < x.length; ++s) {
                rawDataUpdated[s] = (rawDataUpdated[s] - meanUpdated) / stdDevRatio + meanY;
            }
            System.arraycopy(rawDataUpdated, 0, rawdata[probe], 0, x.length);
        }
    }

    /**
     * Loads a tab-separated covariate file and aligns it with the samples (columns) of
     * {@code dataset}.
     *
     * <p>The file is expected to have one covariate per row and one sample per column,
     * with a header line of sample identifiers. If none of the header identifiers match
     * a sample in {@code dataset}, the first column is tried instead and, when that
     * matches, the file is treated as transposed (one sample per row).
     *
     * <p>Values that cannot be parsed as numbers are left as {@code NaN}. Covariates
     * without a single numeric value are dropped, and afterwards every sample that has a
     * {@code NaN} for any remaining covariate (including samples absent from the file)
     * is dropped from both the covariates and the data.
     *
     * <p>Side effects: progress is printed to stdout/stderr; the aligned covariates are
     * saved to {@code covariatesToRemove + "-asLoadedByNormalizer.txt"} and the
     * sample-filtered data to
     * {@code dataset.fileName + "-SampleSizeCorrectedForCovariates.txt"}. On an unusable
     * covariate file the JVM is terminated with a non-zero exit status.
     *
     * @param covariatesToRemove path of the covariate file
     * @param dataset            dataset whose columns are the samples to align against
     * @return pair of (aligned covariate dataset, dataset restricted to the retained samples)
     * @throws IOException if reading the covariate file or saving an output file fails
     */
    private Pair<DoubleMatrixDataset<String, String>, DoubleMatrixDataset<String, String>> loadCovariateValues(String covariatesToRemove, DoubleMatrixDataset<String, String> dataset) throws IOException {
        System.out.println("- Removing covariates as defined in: " + covariatesToRemove);
        TextFile covariates = new TextFile(covariatesToRemove, false);

        int numRows;
        int numCols;
        HashMap<String, Integer> samplesInDatasetIndex = new HashMap<String, Integer>();
        ArrayList<String> columnNames = new ArrayList<String>();
        ArrayList<String> rowNames = new ArrayList<String>();
        boolean[] sampleInDatasetIncludedInCovariates;
        int ctr = 0;

        // First pass: collect row/column identifiers and count samples matching the header.
        try {
            numRows = covariates.countLines() - 1;
            numCols = covariates.countCols(TextFile.tab) - 1;
            if (numRows <= 0 || numCols <= 0) {
                System.err.println("Covariate file is empty, but no covariates found in file! Is your file format correct?");
                System.err.println("The program is expecting the following: tab separated, one covariate per row, one sample per column, with sample identifiers identical to your --in file.");
                System.exit(-1);
            } else {
                System.out.println("Covariate file has " + numRows + " rows and " + numCols + " columns");
            }

            int nrSamplesInDataset = dataset.colObjects.size();
            for (int i = 0; i < nrSamplesInDataset; ++i) {
                samplesInDatasetIndex.put(dataset.colObjects.get(i), i);
            }
            sampleInDatasetIncludedInCovariates = new boolean[nrSamplesInDataset];

            String[] elems = covariates.readLineElemsReturnReference(TextFile.tab);
            for (int i = 1; i < elems.length; ++i) {
                columnNames.add(elems[i]);
                Integer index = samplesInDatasetIndex.get(elems[i]);
                if (index != null) {
                    sampleInDatasetIncludedInCovariates[index] = true;
                    ++ctr;
                }
            }
            elems = covariates.readLineElemsReturnReference(TextFile.tab);
            while (elems != null) {
                rowNames.add(elems[0]);
                elems = covariates.readLineElemsReturnReference(TextFile.tab);
            }
        } finally {
            covariates.close();
        }

        // No header identifier matched: check whether the samples are on the rows instead.
        boolean isTransposed = false;
        if (ctr == 0) {
            System.err.println("No matching samples detected between covariate file and dataset. Maybe your covariate file needs to be transposed? Will test that for you now:");
            for (String rowName : rowNames) {
                Integer index = samplesInDatasetIndex.get(rowName);
                if (index != null) {
                    sampleInDatasetIncludedInCovariates[index] = true;
                    ++ctr;
                }
            }
            if (ctr == 0) {
                System.err.println("Transposing the data does not seem to resolve the issue. Please check your sample identifiers.");
                System.exit(-1);
            } else {
                System.out.println("Transposing the covariate file reveals: " + ctr + " samples present.");
                isTransposed = true;
            }
        }
        if (ctr < dataset.colObjects.size()) {
            System.err.println("Covariates loaded from: " + covariatesToRemove + ", but not all samples present in covariates file! " + ctr + " present in covariates file, out of " + dataset.colObjects.size() + " in dataset...");
            System.out.println("Your dataset will be adjusted accordingly.");
        }

        // Covariate matrix in dataset sample order; NaN marks "no value loaded".
        int nrCovariates = isTransposed ? numCols : numRows;
        double[][] covariateValues = new double[nrCovariates][dataset.colObjects.size()];
        for (int r = 0; r < covariateValues.length; ++r) {
            for (int c = 0; c < covariateValues[r].length; ++c) {
                covariateValues[r][c] = Double.NaN;
            }
        }

        // Second pass: parse the values into the matrix.
        covariates.open();
        try {
            String[] headerElems = covariates.readLineElemsReturnReference(TextFile.tab);
            int lineCtr = 0;
            String[] elems = covariates.readLineElemsReturnReference(TextFile.tab);
            while (elems != null) {
                if (isTransposed) {
                    Integer sampleIdInDataset = samplesInDatasetIndex.get(elems[0]);
                    if (sampleIdInDataset != null) {
                        for (int i = 1; i < elems.length; ++i) {
                            try {
                                covariateValues[i - 1][sampleIdInDataset] = Double.parseDouble(elems[i]);
                            } catch (NumberFormatException ignored) {
                                // Non-numeric entry: deliberately leave the cell untouched (NaN unless set earlier).
                            }
                        }
                    }
                } else {
                    for (int i = 1; i < elems.length; ++i) {
                        Integer sampleIdInDataset = samplesInDatasetIndex.get(headerElems[i]);
                        if (sampleIdInDataset == null) {
                            continue;
                        }
                        try {
                            covariateValues[lineCtr][sampleIdInDataset] = Double.parseDouble(elems[i]);
                        } catch (NumberFormatException ignored) {
                            // Non-numeric entry: deliberately leave the cell untouched (NaN unless set earlier).
                        }
                    }
                }
                elems = covariates.readLineElemsReturnReference(TextFile.tab);
                ++lineCtr;
            }
        } finally {
            covariates.close();
        }

        // Flag covariates that have at least one numeric value.
        int covariateCtr = 0;
        boolean[] includeCovariate = new boolean[covariateValues.length];
        for (int r = 0; r < covariateValues.length; ++r) {
            boolean hasValue = false;
            for (int c = 0; c < covariateValues[r].length && !hasValue; ++c) {
                hasValue = !Double.isNaN(covariateValues[r][c]);
            }
            includeCovariate[r] = hasValue;
            if (hasValue) {
                ++covariateCtr;
            }
        }
        if (covariateCtr == 0) {
            System.err.println("ERROR: none of your covariates seem to have valid numerical values.. Please check your covariate file.");
            System.exit(-1);
        } else {
            System.out.println("After removing covariates without data, your dataset will have " + covariateCtr + " covariates (out of: " + covariateValues.length + ") .");
        }

        ArrayList<String> covariateNames = isTransposed ? columnNames : rowNames;
        if (covariateCtr != covariateValues.length) {
            System.out.println("Removing covariates that have no data at all.");
            double[][] newCovariateData = new double[covariateCtr][];
            ArrayList<String> newCovariateNames = new ArrayList<String>();
            int newCovariateCTR = 0;
            for (int r = 0; r < covariateValues.length; ++r) {
                if (includeCovariate[r]) {
                    newCovariateNames.add(covariateNames.get(r));
                    // The old matrix is discarded below, so the row can be reused as-is.
                    newCovariateData[newCovariateCTR] = covariateValues[r];
                    ++newCovariateCTR;
                } else {
                    System.out.println(covariateNames.get(r) + " removed.");
                }
            }
            covariateValues = newCovariateData;
            covariateNames = newCovariateNames;
        }

        System.out.println("");
        System.out.println("Remaining covariates: ");
        for (String name : covariateNames) {
            System.out.println(name);
        }
        System.out.println("");

        // A sample is kept only if it has a value for every remaining covariate.
        for (int r = 0; r < covariateValues.length; ++r) {
            for (int c = 0; c < covariateValues[r].length; ++c) {
                if (Double.isNaN(covariateValues[r][c])) {
                    sampleInDatasetIncludedInCovariates[c] = false;
                }
            }
        }
        int sampleCtr = 0;
        for (int q = 0; q < sampleInDatasetIncludedInCovariates.length; ++q) {
            if (sampleInDatasetIncludedInCovariates[q]) {
                ++sampleCtr;
            }
        }
        System.out.println("Your covariate corrected dataset will have " + sampleCtr + " samples, after removing samples with missing covariate values.");

        // Resolve the retained column indexes once and reuse them for both matrices.
        int[] keptColumns = new int[sampleCtr];
        ArrayList<String> newColObjects = new ArrayList<String>();
        int kept = 0;
        for (int c = 0; c < dataset.colObjects.size(); ++c) {
            if (sampleInDatasetIncludedInCovariates[c]) {
                keptColumns[kept++] = c;
                newColObjects.add(dataset.colObjects.get(c));
            }
        }

        double[][] rawData = dataset.getRawData();
        double[][] newRawData = new double[rawData.length][sampleCtr];
        for (int r = 0; r < rawData.length; ++r) {
            for (int k = 0; k < sampleCtr; ++k) {
                newRawData[r][k] = rawData[r][keptColumns[k]];
            }
        }
        double[][] finalCovariateData = new double[covariateValues.length][sampleCtr];
        for (int r = 0; r < covariateValues.length; ++r) {
            for (int k = 0; k < sampleCtr; ++k) {
                finalCovariateData[r][k] = covariateValues[r][keptColumns[k]];
            }
        }

        DoubleMatrixDataset<String, String> covariateDataset = new DoubleMatrixDataset<String, String>(finalCovariateData, covariateNames, newColObjects);
        covariateDataset.save(covariatesToRemove + "-asLoadedByNormalizer.txt");
        DoubleMatrixDataset<String, String> newDataset = new DoubleMatrixDataset<String, String>(newRawData, dataset.rowObjects, newColObjects);
        newDataset.save(dataset.fileName + "-SampleSizeCorrectedForCovariates.txt");
        return new Pair<DoubleMatrixDataset<String, String>, DoubleMatrixDataset<String, String>>(covariateDataset, newDataset);
    }

    /**
     * Removes every row of {@code dataset} whose variance is exactly zero.
     *
     * <p>Each removed row is reported on stdout. When at least one row is removed the
     * dataset is modified in place (raw data and row names replaced, hash maps
     * recalculated) and saved to
     * {@code outputFileNamePrefix + ".ProbesWithZeroVarianceRemoved.txt.gz"}. If every
     * row has zero variance the JVM is terminated with exit status -1.
     *
     * @param dataset              dataset to filter in place
     * @param outputFileNamePrefix file name prefix (without extension) for the saved result
     * @return the prefix with {@code ".ProbesWithZeroVarianceRemoved"} appended when rows
     *         were removed, otherwise {@code outputFileNamePrefix} unchanged
     * @throws IOException if saving the filtered dataset fails
     */
    private String removeProbesWithZeroVariance(DoubleMatrixDataset<String, String> dataset, String outputFileNamePrefix) throws IOException {
        boolean[] dataHasZeroVariance = new boolean[dataset.nrRows];
        int nrRowsWithZeroVariance = 0;
        for (int row = 0; row < dataset.nrRows; ++row) {
            // A NaN variance is not equal to zero, so such rows are kept.
            if (ArrayMath.variance(dataset.rawData[row]) == 0.0) {
                System.out.println("Removing probe with zero variance: " + dataset.rowObjects.get(row) + " on line " + (row + 1));
                dataHasZeroVariance[row] = true;
                ++nrRowsWithZeroVariance;
            }
        }
        if (nrRowsWithZeroVariance == 0) {
            return outputFileNamePrefix;
        }

        int newNrRows = dataset.nrRows - nrRowsWithZeroVariance;
        if (newNrRows == 0) {
            System.err.println("ERROR: all probes have zero variance!");
            System.exit(-1);
        }

        // Only the outer array is allocated: surviving rows are reused by reference,
        // so pre-allocating a full matrix would be discarded immediately.
        double[][] newData = new double[newNrRows][];
        ArrayList<String> newRowHeader = new ArrayList<String>();
        int ctr = 0;
        for (int row = 0; row < dataset.nrRows; ++row) {
            if (dataHasZeroVariance[row]) {
                continue;
            }
            newData[ctr] = dataset.rawData[row];
            newRowHeader.add(dataset.rowObjects.get(row));
            ++ctr;
        }
        dataset.rawData = newData;
        dataset.rowObjects = newRowHeader;
        dataset.recalculateHashMaps();

        String outputFileName = outputFileNamePrefix + ".ProbesWithZeroVarianceRemoved";
        dataset.save(outputFileName + ".txt.gz");
        return outputFileName;
    }
}

