/*
 * Decompiled with CFR 0.152.
 */
package cc.mallet.topics;

import cc.mallet.topics.ParallelTopicModel;
import cc.mallet.topics.TopicAssignment;
import cc.mallet.types.Alphabet;
import cc.mallet.types.FeatureSequence;
import cc.mallet.types.IDSorter;
import cc.mallet.types.InstanceList;
import cc.mallet.types.LabelSequence;
import com.carrotsearch.hppc.IntHashSet;
import java.io.File;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.Formatter;
import java.util.Iterator;
import java.util.Locale;
import java.util.TreeSet;

public class TopicModelDiagnostics {
    /** Number of topics, read from {@code model.getNumTopics()} in the constructor. */
    int numTopics;
    /** Number of top-ranked words tracked per topic (constructor argument). */
    int numTopWords;
    /** Index of the 0.02 threshold in {@link #DEFAULT_DOC_PROPORTIONS}. */
    public static final int TWO_PERCENT_INDEX = 1;
    /** Index of the 0.5 threshold in {@link #DEFAULT_DOC_PROPORTIONS}. */
    public static final int FIFTY_PERCENT_INDEX = 6;
    /** Ascending document-proportion thresholds used to bucket documents per topic. */
    public static final double[] DEFAULT_DOC_PROPORTIONS = new double[]{0.01, 0.02, 0.05, 0.1, 0.2, 0.3, 0.5};
    /** Per-topic sorted word sets as returned by {@code model.getSortedWords()}; iterated first-to-last to obtain the top words. */
    ArrayList<TreeSet<IDSorter>> topicSortedWords;
    /** {@code [topic][rank]} top-word strings; trailing entries stay {@code null} when a topic has fewer than {@code numTopWords} words. */
    String[][] topicTopWords;
    /** The diagnostic score tables, appended in the constructor in a fixed order. */
    ArrayList<TopicScores> diagnostics;
    /** The model being diagnosed. */
    ParallelTopicModel model;
    /** The model's alphabet, used to map between word-type indices and word strings. */
    Alphabet alphabet;
    /**
     * {@code [topic][i][j]}: number of documents in which top words {@code i} and {@code j} of the
     * topic were both assigned to that topic; the diagonal holds the per-word document count.
     */
    int[][][] topicCodocumentMatrices;
    /** Per topic: number of documents in which this topic has the largest token count. */
    int[] numRank1Documents;
    /** Per topic: number of documents containing at least one token assigned to the topic. */
    int[] numNonZeroDocuments;
    /** {@code [topic][i]}: number of documents whose smoothed topic proportion is not below {@code DEFAULT_DOC_PROPORTIONS[i]}. */
    int[][] numDocumentsAtProportions;
    /** Per topic: sum over documents of {@code n * log(n)}, where {@code n} is the topic's token count in the document. */
    double[] sumCountTimesLogCount;
    /** Corpus-wide token count for each word type, indexed by alphabet index. */
    int[] wordTypeCounts;
    /** Total number of tokens seen by {@code collectDocumentStatistics()}. */
    int numTokens = 0;

    /**
     * Computes the full set of diagnostics for a trained topic model.
     *
     * <p>The constructor caches the top {@code numTopWords} words of every topic, gathers the
     * per-document statistics, and then appends one {@link TopicScores} table per diagnostic
     * to {@code diagnostics}. The order of the tables is the order in which they appear in
     * {@link #toString()} and {@link #toXML()}.
     *
     * @param model       the trained model to analyze
     * @param numTopWords how many top-ranked words to track per topic
     */
    public TopicModelDiagnostics(ParallelTopicModel model, int numTopWords) {
        this.numTopics = model.getNumTopics();
        this.numTopWords = numTopWords;
        this.model = model;
        this.alphabet = model.getAlphabet();
        this.topicSortedWords = model.getSortedWords();
        this.topicTopWords = new String[this.numTopics][numTopWords];
        this.numRank1Documents = new int[this.numTopics];
        this.numNonZeroDocuments = new int[this.numTopics];
        this.numDocumentsAtProportions = new int[this.numTopics][DEFAULT_DOC_PROPORTIONS.length];
        this.sumCountTimesLogCount = new double[this.numTopics];
        this.diagnostics = new ArrayList<TopicScores>();

        for (int topic = 0; topic < this.numTopics; ++topic) {
            TreeSet<IDSorter> sortedWords = this.topicSortedWords.get(topic);
            // A topic may have fewer words than requested; the remaining slots stay null.
            int limit = Math.min(numTopWords, sortedWords.size());
            Iterator<IDSorter> iterator = sortedWords.iterator();
            for (int i = 0; i < limit; ++i) {
                IDSorter info = iterator.next();
                this.topicTopWords[topic][i] = (String) this.alphabet.lookupObject(info.getID());
            }
        }

        this.collectDocumentStatistics();

        this.diagnostics.add(this.getTokensPerTopic(model.tokensPerTopic));
        this.diagnostics.add(this.getDocumentEntropy(model.tokensPerTopic));
        this.diagnostics.add(this.getWordLengthScores());
        this.diagnostics.add(this.getCoherence());
        this.diagnostics.add(this.getDistanceFromUniform());
        this.diagnostics.add(this.getDistanceFromCorpus());
        this.diagnostics.add(this.getEffectiveNumberOfWords());
        this.diagnostics.add(this.getTokenDocumentDiscrepancies());
        this.diagnostics.add(this.getRank1Percent());
        this.diagnostics.add(this.getDocumentPercentRatio(FIFTY_PERCENT_INDEX, TWO_PERCENT_INDEX));
        // Index 5 selects the 0.3 threshold of DEFAULT_DOC_PROPORTIONS.
        this.diagnostics.add(this.getDocumentPercent(5));
        this.diagnostics.add(this.getExclusivity());
    }

    /**
     * Scans every document of the model once and rebuilds all document-level statistics:
     * the total token count, per-type corpus counts, the per-topic co-document matrices of
     * the top words, the rank-1 and non-zero document counts, the document counts per
     * proportion threshold, and the per-topic sum of {@code n * log(n)}.
     *
     * <p>All accumulators are reset at the start, so calling this method more than once
     * recomputes the statistics instead of adding to the previous totals.
     */
    public void collectDocumentStatistics() {
        this.topicCodocumentMatrices = new int[this.numTopics][this.numTopWords][this.numTopWords];
        this.wordTypeCounts = new int[this.alphabet.size()];
        this.numTokens = 0;
        // These were previously left untouched, so a second call double-counted them.
        this.numRank1Documents = new int[this.numTopics];
        this.numNonZeroDocuments = new int[this.numTopics];
        this.numDocumentsAtProportions = new int[this.numTopics][DEFAULT_DOC_PROPORTIONS.length];
        this.sumCountTimesLogCount = new double[this.numTopics];

        // Marks a rank slot that holds no word. The array default of 0 is a valid word type
        // and would make empty slots look like occurrences of type 0.
        final int noWord = -1;

        // Set of top-word type indices per topic, for fast membership tests.
        IntHashSet[] topicTopWordIndices = new IntHashSet[this.numTopics];
        // The same type indices, but addressable by rank.
        int[][] topicWordIndicesInOrder = new int[this.numTopics][this.numTopWords];
        // Per topic: top words seen (assigned to that topic) in the current document.
        IntHashSet[] docTopicWordIndices = new IntHashSet[this.numTopics];
        // Per topic: token count in the current document.
        int[] topicCounts = new int[this.numTopics];

        for (int topic = 0; topic < this.numTopics; ++topic) {
            IntHashSet wordIndices = new IntHashSet();
            for (int i = 0; i < this.numTopWords; ++i) {
                String word = this.topicTopWords[topic][i];
                if (word == null) {
                    topicWordIndicesInOrder[topic][i] = noWord;
                    continue;
                }
                int type = this.alphabet.lookupIndex(word);
                topicWordIndicesInOrder[topic][i] = type;
                wordIndices.add(type);
            }
            topicTopWordIndices[topic] = wordIndices;
            docTopicWordIndices[topic] = new IntHashSet();
        }

        for (TopicAssignment document : this.model.getData()) {
            FeatureSequence tokens = (FeatureSequence) document.instance.getData();
            LabelSequence topics = document.topicSequence;

            for (int position = 0; position < tokens.size(); ++position) {
                int type = tokens.getIndexAtPosition(position);
                int topic = topics.getIndexAtPosition(position);
                ++this.numTokens;
                this.wordTypeCounts[type]++;
                topicCounts[topic]++;
                if (topicTopWordIndices[topic].contains(type)) {
                    docTopicWordIndices[topic].add(type);
                }
            }

            int docLength = tokens.size();
            if (docLength == 0) {
                continue;
            }

            int maxTopic = -1;
            int maxCount = -1;
            for (int topic = 0; topic < this.numTopics; ++topic) {
                int count = topicCounts[topic];
                if (count <= 0) {
                    continue;
                }
                this.numNonZeroDocuments[topic]++;
                // Strict comparison: on ties the lowest-numbered topic is rank 1.
                if (count > maxCount) {
                    maxTopic = topic;
                    maxCount = count;
                }
                this.sumCountTimesLogCount[topic] += (double) count * Math.log(count);

                double proportion = (this.model.alpha[topic] + (double) count)
                        / (this.model.alphaSum + (double) docLength);
                // Thresholds are ascending, so stop at the first one the proportion falls below.
                for (int i = 0; i < DEFAULT_DOC_PROPORTIONS.length; ++i) {
                    if (proportion < DEFAULT_DOC_PROPORTIONS[i]) {
                        break;
                    }
                    this.numDocumentsAtProportions[topic][i]++;
                }

                IntHashSet supportedWords = docTopicWordIndices[topic];
                int[] indices = topicWordIndicesInOrder[topic];
                int[][] matrix = this.topicCodocumentMatrices[topic];
                for (int i = 0; i < this.numTopWords; ++i) {
                    if (indices[i] < 0 || !supportedWords.contains(indices[i])) {
                        continue;
                    }
                    // Diagonal: this document contains word i assigned to the topic.
                    matrix[i][i]++;
                    for (int j = i + 1; j < this.numTopWords; ++j) {
                        if (indices[j] < 0 || !supportedWords.contains(indices[j])) {
                            continue;
                        }
                        // Off-diagonal counts are kept symmetric.
                        matrix[i][j]++;
                        matrix[j][i]++;
                    }
                }

                // Reset the per-document scratch state for the next document.
                supportedWords.clear();
                topicCounts[topic] = 0;
            }

            if (maxTopic > -1) {
                this.numRank1Documents[maxTopic]++;
            }
        }
    }

    /**
     * Returns the co-document count matrix of the top words of one topic.
     *
     * <p>The returned array is the internal one, not a copy.
     *
     * @param topic the topic index
     * @return the {@code numTopWords x numTopWords} matrix for {@code topic}
     */
    public int[][] getCodocumentMatrix(int topic) {
        int[][] matrixForTopic = this.topicCodocumentMatrices[topic];
        return matrixForTopic;
    }

    /**
     * Reports the number of tokens assigned to each topic ("tokens").
     *
     * @param tokensPerTopic token totals indexed by topic
     * @return a score table whose topic score is the corresponding token total
     */
    public TopicScores getTokensPerTopic(int[] tokensPerTopic) {
        TopicScores result = new TopicScores("tokens", this.numTopics, this.numTopWords);
        int topic = 0;
        while (topic < this.numTopics) {
            result.setTopicScore(topic, tokensPerTopic[topic]);
            topic++;
        }
        return result;
    }

    /**
     * Computes, per topic, the entropy of the topic's tokens over documents
     * ("document_entropy"): {@code log(N) - sum(n * log(n)) / N}, where {@code N} is the
     * topic's token total and the sum runs over the per-document counts {@code n} gathered
     * by {@link #collectDocumentStatistics()}.
     *
     * @param tokensPerTopic token totals indexed by topic
     * @return a score table holding one entropy value per topic
     */
    public TopicScores getDocumentEntropy(int[] tokensPerTopic) {
        TopicScores result = new TopicScores("document_entropy", this.numTopics, this.numTopWords);
        for (int topic = 0; topic < this.numTopics; topic++) {
            double total = (double) tokensPerTopic[topic];
            double weightedLogSum = this.sumCountTimesLogCount[topic];
            double entropy = -weightedLogSum / total + Math.log(total);
            result.setTopicScore(topic, entropy);
        }
        return result;
    }

    /**
     * Computes, per topic, the sum over all of its words of
     * {@code p * log(p * numTypes)} with {@code p = count / tokensPerTopic}
     * ("uniform_dist"), i.e. the divergence of the topic's word distribution from the
     * uniform distribution over the alphabet. The per-word terms of the first
     * {@code numTopWords} words are stored as word scores.
     *
     * @return the score table
     */
    public TopicScores getDistanceFromUniform() {
        int[] topicTotals = this.model.tokensPerTopic;
        TopicScores result = new TopicScores("uniform_dist", this.numTopics, this.numTopWords);
        result.wordScoresDefined = true;
        int vocabularySize = this.alphabet.size();

        for (int topic = 0; topic < this.numTopics; topic++) {
            double total = (double) topicTotals[topic];
            double divergence = 0.0;
            int rank = 0;
            Iterator<IDSorter> words = this.topicSortedWords.get(topic).iterator();
            while (words.hasNext()) {
                double count = words.next().getWeight();
                double probability = count / total;
                double ratioToUniform = count * (double) vocabularySize / total;
                double term = probability * Math.log(ratioToUniform);
                // Only the top words have a slot in the word-score table.
                if (rank < this.numTopWords) {
                    result.setTopicWordScore(topic, rank, term);
                }
                divergence += term;
                rank++;
            }
            result.setTopicScore(topic, divergence);
        }
        return result;
    }

    /**
     * Computes, per topic, the inverse of the sum of squared word probabilities
     * ("eff_num_words"), where a word's probability is its weight divided by the topic's
     * token total.
     *
     * @return a score table holding one value per topic
     */
    public TopicScores getEffectiveNumberOfWords() {
        int[] topicTotals = this.model.tokensPerTopic;
        TopicScores result = new TopicScores("eff_num_words", this.numTopics, this.numTopWords);

        for (int topic = 0; topic < this.numTopics; topic++) {
            double total = (double) topicTotals[topic];
            double squaredSum = 0.0;
            Iterator<IDSorter> words = this.topicSortedWords.get(topic).iterator();
            while (words.hasNext()) {
                double probability = words.next().getWeight() / total;
                squaredSum += probability * probability;
            }
            result.setTopicScore(topic, 1.0 / squaredSum);
        }
        return result;
    }

    /**
     * Computes, per topic, the sum over all of its words of
     * {@code p * log(p / corpusP)} ("corpus_dist"), where {@code p} is the word's
     * probability within the topic and {@code corpusP} is its relative frequency in the
     * whole corpus. The per-word terms of the first {@code numTopWords} words are stored
     * as word scores.
     *
     * @return the score table
     */
    public TopicScores getDistanceFromCorpus() {
        int[] topicTotals = this.model.tokensPerTopic;
        TopicScores result = new TopicScores("corpus_dist", this.numTopics, this.numTopWords);
        result.wordScoresDefined = true;

        for (int topic = 0; topic < this.numTopics; topic++) {
            double total = (double) topicTotals[topic];
            // numTokens / total turns (count / corpusCount) into the probability ratio.
            double corpusToTopicRatio = (double) this.numTokens / total;
            double divergence = 0.0;
            int rank = 0;
            Iterator<IDSorter> words = this.topicSortedWords.get(topic).iterator();
            while (words.hasNext()) {
                IDSorter entry = words.next();
                int wordType = entry.getID();
                double count = entry.getWeight();
                double probability = count / total;
                double ratioToCorpus = corpusToTopicRatio * count / (double) this.wordTypeCounts[wordType];
                double term = probability * Math.log(ratioToCorpus);
                if (rank < this.numTopWords) {
                    result.setTopicWordScore(topic, rank, term);
                }
                divergence += term;
                rank++;
            }
            result.setTopicScore(topic, divergence);
        }
        return result;
    }

    /**
     * Compares, for the top words of each topic, the distribution of token weights with the
     * distribution of document counts (the diagonal of the co-document matrix)
     * ("token-doc-diff"). Both are normalized over the top words; each word contributes
     * {@code 0.5 * p * log(p / m) + 0.5 * q * log(q / m)} with {@code m = 0.5 * (p + q)},
     * and the topic score is the sum of the word contributions.
     *
     * @return the score table
     */
    public TopicScores getTokenDocumentDiscrepancies() {
        TopicScores result = new TopicScores("token-doc-diff", this.numTopics, this.numTopWords);
        result.wordScoresDefined = true;

        for (int topic = 0; topic < this.numTopics; topic++) {
            int[][] codocCounts = this.topicCodocumentMatrices[topic];
            Iterator<IDSorter> words = this.topicSortedWords.get(topic).iterator();
            double[] tokenWeights = new double[this.numTopWords];
            double[] documentCounts = new double[this.numTopWords];
            double tokenTotal = 0.0;
            double documentTotal = 0.0;

            // Collect the raw weights of the available top words; missing ranks stay 0.
            int rank = 0;
            while (words.hasNext() && rank < this.numTopWords) {
                IDSorter entry = words.next();
                tokenWeights[rank] = entry.getWeight();
                documentCounts[rank] = codocCounts[rank][rank];
                tokenTotal += tokenWeights[rank];
                documentTotal += documentCounts[rank];
                rank++;
            }

            double divergence = 0.0;
            for (rank = 0; rank < this.numTopWords; rank++) {
                double p = tokenWeights[rank] / tokenTotal;
                double q = documentCounts[rank] / documentTotal;
                double midpoint = 0.5 * (p + q);
                double contribution = 0.0;
                // Zero-probability terms are skipped rather than evaluated as 0 * log(0).
                if (p > 0.0) {
                    contribution += 0.5 * p * Math.log(p / midpoint);
                }
                if (q > 0.0) {
                    contribution += 0.5 * q * Math.log(q / midpoint);
                }
                result.setTopicWordScore(topic, rank, contribution);
                divergence += contribution;
            }
            result.setTopicScore(topic, divergence);
        }
        return result;
    }

    /**
     * Reports the character length of each top word and, per topic, the mean length of its
     * top words ("word-length").
     *
     * <p>The mean is taken over the words the topic actually has. A topic with fewer than
     * {@code numTopWords} words is therefore not biased toward zero by its empty slots, and
     * a topic without any word scores 0 instead of NaN.
     *
     * @return the score table
     */
    public TopicScores getWordLengthScores() {
        TopicScores scores = new TopicScores("word-length", this.numTopics, this.numTopWords);
        scores.wordScoresDefined = true;
        for (int topic = 0; topic < this.numTopics; ++topic) {
            String[] words = this.topicTopWords[topic];
            int total = 0;
            int wordCount = 0;
            // Words are stored contiguously from rank 0; the first null ends the list.
            for (int position = 0; position < words.length && words[position] != null; ++position) {
                int length = words[position].length();
                total += length;
                ++wordCount;
                scores.setTopicWordScore(topic, position, length);
            }
            double mean = wordCount == 0 ? 0.0 : (double) total / (double) wordCount;
            scores.setTopicScore(topic, mean);
        }
        return scores;
    }

    /**
     * Standardizes the length of every top word against the mean and sample standard
     * deviation of all top-word lengths across all topics ("word-length-sd"). Each word
     * score is the word's z-score; each topic score is the sum of its words' z-scores.
     *
     * @return the score table
     */
    public TopicScores getWordLengthStandardDeviation() {
        TopicScores result = new TopicScores("word-length-sd", this.numTopics, this.numTopWords);
        result.wordScoresDefined = true;

        // Pass 1: mean length over every available top word.
        double lengthSum = 0.0;
        int wordCount = 0;
        for (int topic = 0; topic < this.numTopics; topic++) {
            String[] words = this.topicTopWords[topic];
            for (int rank = 0; rank < words.length && words[rank] != null; rank++) {
                lengthSum += (double) words[rank].length();
                wordCount++;
            }
        }
        double meanLength = lengthSum / (double) wordCount;

        // Pass 2: sample variance (n - 1 denominator).
        double squaredDeviationSum = 0.0;
        for (int topic = 0; topic < this.numTopics; topic++) {
            String[] words = this.topicTopWords[topic];
            for (int rank = 0; rank < words.length && words[rank] != null; rank++) {
                double deviation = (double) words[rank].length() - meanLength;
                squaredDeviationSum += deviation * deviation;
            }
        }
        double lengthSD = Math.sqrt(squaredDeviationSum / (double) (wordCount - 1));

        // Pass 3: z-scores per word, accumulated per topic.
        for (int topic = 0; topic < this.numTopics; topic++) {
            String[] words = this.topicTopWords[topic];
            for (int rank = 0; rank < words.length && words[rank] != null; rank++) {
                double zScore = ((double) words[rank].length() - meanLength) / lengthSD;
                result.addToTopicScore(topic, zScore);
                result.setTopicWordScore(topic, rank, zScore);
            }
        }
        return result;
    }

    /**
     * Computes topic coherence from the co-document matrix ("coherence"). For every pair of
     * top words with {@code col < row} the term is
     * {@code log((codoc[row][col] + beta) / (codoc[col][col] + beta))}. The topic score is
     * the sum of all terms; the word score of a row is the smallest term in that row,
     * capped at 0.
     *
     * @return the score table
     */
    public TopicScores getCoherence() {
        TopicScores result = new TopicScores("coherence", this.numTopics, this.numTopWords);
        result.wordScoresDefined = true;
        double beta = this.model.beta;

        for (int topic = 0; topic < this.numTopics; topic++) {
            int[][] codoc = this.topicCodocumentMatrices[topic];
            double coherence = 0.0;
            for (int row = 0; row < this.numTopWords; row++) {
                double rowSum = 0.0;
                double lowest = 0.0;
                for (int col = 0; col < row; col++) {
                    double jointCount = (double) codoc[row][col] + beta;
                    double singleCount = (double) codoc[col][col] + beta;
                    double term = Math.log(jointCount / singleCount);
                    rowSum += term;
                    if (term < lowest) {
                        lowest = term;
                    }
                }
                coherence += rowSum;
                result.setTopicWordScore(topic, row, lowest);
            }
            result.setTopicScore(topic, coherence);
        }
        return result;
    }

    /**
     * Reports, per topic, the fraction of the documents containing the topic in which it is
     * the most frequent topic ("rank_1_docs").
     *
     * @return a score table holding one ratio per topic
     */
    public TopicScores getRank1Percent() {
        TopicScores result = new TopicScores("rank_1_docs", this.numTopics, this.numTopWords);
        for (int topic = 0; topic < this.numTopics; topic++) {
            double rankOneDocs = (double) this.numRank1Documents[topic];
            double docsWithTopic = (double) this.numNonZeroDocuments[topic];
            result.setTopicScore(topic, rankOneDocs / docsWithTopic);
        }
        return result;
    }

    /**
     * Reports, per topic, the ratio between the document counts at two proportion
     * thresholds ("allocation_ratio").
     *
     * <p>Both arguments index {@link #DEFAULT_DOC_PROPORTIONS}. If either index is out of
     * range, a message is printed to {@code System.err} and an all-zero score table is
     * returned.
     *
     * @param numeratorIndex   threshold index of the count used as numerator
     * @param denominatorIndex threshold index of the count used as denominator
     * @return the score table
     */
    public TopicScores getDocumentPercentRatio(int numeratorIndex, int denominatorIndex) {
        TopicScores scores = new TopicScores("allocation_ratio", this.numTopics, this.numTopWords);
        // Every row of numDocumentsAtProportions has this length; using the constant also
        // keeps the check valid when the model has no topics.
        int numProportions = DEFAULT_DOC_PROPORTIONS.length;
        boolean numeratorValid = numeratorIndex >= 0 && numeratorIndex < numProportions;
        boolean denominatorValid = denominatorIndex >= 0 && denominatorIndex < numProportions;
        if (!numeratorValid || !denominatorValid) {
            System.err.println("Invalid proportion indices (max " + (numProportions - 1) + ") : " + numeratorIndex + ", " + denominatorIndex);
            return scores;
        }
        for (int topic = 0; topic < this.numTopics; ++topic) {
            int[] counts = this.numDocumentsAtProportions[topic];
            scores.setTopicScore(topic, (double) counts[numeratorIndex] / (double) counts[denominatorIndex]);
        }
        return scores;
    }

    /**
     * Reports, per topic, the fraction of the documents containing the topic whose topic
     * proportion reaches the given threshold ("allocation_count").
     *
     * <p>If the index is out of range, a message is printed to {@code System.err} and an
     * all-zero score table is returned.
     *
     * @param i index into {@link #DEFAULT_DOC_PROPORTIONS}
     * @return the score table
     */
    public TopicScores getDocumentPercent(int i) {
        TopicScores scores = new TopicScores("allocation_count", this.numTopics, this.numTopWords);
        // Every row of numDocumentsAtProportions has this length; using the constant also
        // keeps the check valid when the model has no topics.
        int numProportions = DEFAULT_DOC_PROPORTIONS.length;
        if (i < 0 || i >= numProportions) {
            System.err.println("Invalid proportion indices (max " + (numProportions - 1) + ") : " + i);
            return scores;
        }
        for (int topic = 0; topic < this.numTopics; ++topic) {
            scores.setTopicScore(topic, (double) this.numDocumentsAtProportions[topic][i] / (double) this.numNonZeroDocuments[topic]);
        }
        return scores;
    }

    /**
     * Computes word exclusivity ("exclusivity"): the smoothed probability of a word in this
     * topic divided by the sum of its smoothed probabilities over all topics. Word scores
     * are stored for the leading words of each topic; the topic score is the sum of those
     * word scores divided by {@code numTopWords}.
     *
     * @return the score table
     */
    public TopicScores getExclusivity() {
        int[] topicTotals = this.model.tokensPerTopic;
        TopicScores result = new TopicScores("exclusivity", this.numTopics, this.numTopWords);
        result.wordScoresDefined = true;

        // Probability mass every word gets from smoothing alone, summed over topics.
        double smoothingMass = 0.0;
        for (int topic = 0; topic < this.numTopics; topic++) {
            smoothingMass += this.model.beta / (this.model.betaSum + (double) topicTotals[topic]);
        }

        for (int topic = 0; topic < this.numTopics; topic++) {
            double topicDenominator = this.model.betaSum + (double) topicTotals[topic];
            double exclusivitySum = 0.0;
            int rank = 0;
            Iterator<IDSorter> words = this.topicSortedWords.get(topic).iterator();
            while (words.hasNext()) {
                IDSorter entry = words.next();
                int wordType = entry.getID();
                double count = entry.getWeight();

                double massAcrossTopics = smoothingMass;
                int[] packedCounts = this.model.typeTopicCounts[wordType];
                // Each entry packs a count (high bits) with a topic id (low bits); the scan
                // stops at the first non-positive entry.
                int slot = 0;
                while (slot < packedCounts.length && packedCounts[slot] > 0) {
                    int otherTopic = packedCounts[slot] & this.model.topicMask;
                    int otherCount = packedCounts[slot] >> this.model.topicBits;
                    massAcrossTopics += (double) otherCount / (this.model.betaSum + (double) topicTotals[otherTopic]);
                    slot++;
                }

                double exclusivity = (this.model.beta + count) / topicDenominator / massAcrossTopics;
                result.setTopicWordScore(topic, rank, exclusivity);
                exclusivitySum += exclusivity;

                rank++;
                if (rank == this.numTopWords) {
                    break;
                }
            }
            result.setTopicScore(topic, exclusivitySum / (double) this.numTopWords);
        }
        return result;
    }

    /**
     * Renders the diagnostics as tab-separated text: one header line per topic listing all
     * topic-level scores, followed by one line per available top word listing the scores of
     * every diagnostic that defines word scores. Numbers are formatted with
     * {@link Locale#US}.
     *
     * @return the text report
     */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder();
        Formatter fmt = new Formatter(text, Locale.US);

        for (int topic = 0; topic < this.numTopics; topic++) {
            fmt.format("Topic %d", topic);
            for (TopicScores metric : this.diagnostics) {
                fmt.format("\t%s=%.4f", metric.name, metric.scores[topic]);
            }
            fmt.format("\n");

            String[] words = this.topicTopWords[topic];
            for (int rank = 0; rank < words.length; rank++) {
                String word = words[rank];
                if (word == null) {
                    // Words are stored contiguously; the first null ends the list.
                    break;
                }
                fmt.format("  %s", word);
                for (TopicScores metric : this.diagnostics) {
                    if (metric.wordScoresDefined) {
                        fmt.format("\t%s=%.4f", metric.name, metric.topicWordScores[topic][rank]);
                    }
                }
                text.append("\n");
            }
        }
        return text.toString();
    }

    /**
     * Renders the diagnostics as an XML document.
     *
     * <p>The root {@code <model>} element contains one {@code <topic>} element per topic,
     * carrying every topic-level score as an attribute. Each topic contains one
     * {@code <word>} element per available top word with its rank, count, probability,
     * cumulative probability, document count and every defined word-level score. The word
     * text is escaped for XML character data ({@code &}, {@code <} and {@code >}).
     * Numbers are formatted with {@link Locale#US}.
     *
     * @return the XML report
     */
    public String toXML() {
        int[] tokensPerTopic = this.model.tokensPerTopic;
        StringBuilder out = new StringBuilder();
        Formatter formatter = new Formatter(out, Locale.US);
        out.append("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n");
        out.append("<model>\n");
        for (int topic = 0; topic < this.numTopics; ++topic) {
            int[][] matrix = this.topicCodocumentMatrices[topic];
            formatter.format("<topic id='%d'", topic);
            for (TopicScores scores : this.diagnostics) {
                formatter.format(" %s='%.4f'", scores.name, scores.scores[topic]);
            }
            out.append(">\n");

            TreeSet<IDSorter> sortedWords = this.topicSortedWords.get(topic);
            // A topic may have fewer words than numTopWords.
            int limit = Math.min(this.numTopWords, sortedWords.size());
            double cumulativeProbability = 0.0;
            Iterator<IDSorter> iterator = sortedWords.iterator();
            for (int position = 0; position < limit; ++position) {
                IDSorter info = iterator.next();
                double probability = info.getWeight() / (double) tokensPerTopic[topic];
                cumulativeProbability += probability;
                formatter.format("<word rank='%d' count='%.0f' prob='%.5f' cumulative='%.5f' docs='%d'",
                        position + 1, info.getWeight(), probability, cumulativeProbability, matrix[position][position]);
                for (TopicScores scores : this.diagnostics) {
                    if (!scores.wordScoresDefined) {
                        continue;
                    }
                    formatter.format(" %s='%.4f'", scores.name, scores.topicWordScores[topic][position]);
                }
                // '&' must be replaced first so the entities added afterwards are not
                // escaped again. '<' maps to &lt; (it was wrongly emitted as &gt;).
                String escapedWord = this.topicTopWords[topic][position]
                        .replace("&", "&amp;")
                        .replace("<", "&lt;")
                        .replace(">", "&gt;");
                formatter.format(">%s</word>\n", escapedWord);
            }
            out.append("</topic>\n");
        }
        out.append("</model>\n");
        return out.toString();
    }

    /**
     * Command-line entry point: trains a topic model on a serialized instance list and,
     * when an output path is given, writes the XML diagnostics to it.
     *
     * <p>Arguments: {@code <instance-list file> <number of topics> [<output XML file>]}.
     * The report is written only when exactly three arguments are supplied.
     *
     * @param args the command-line arguments
     * @throws IllegalArgumentException if fewer than two arguments are given
     * @throws Exception if loading, training or writing fails
     */
    public static void main(String[] args) throws Exception {
        if (args.length < 2) {
            throw new IllegalArgumentException(
                    "Usage: TopicModelDiagnostics <instance-list file> <number of topics> [<output XML file>]");
        }
        InstanceList instances = InstanceList.load(new File(args[0]));
        int numTopics = Integer.parseInt(args[1]);
        ParallelTopicModel model = new ParallelTopicModel(numTopics, 5.0, 0.01);
        model.addInstances(instances);
        model.setNumIterations(1000);
        model.estimate();
        TopicModelDiagnostics diagnostics = new TopicModelDiagnostics(model, 20);
        if (args.length == 3) {
            // The XML prolog declares UTF-8, so the file must be written in UTF-8 rather
            // than the platform default charset.
            try (PrintWriter out = new PrintWriter(args[2], "UTF-8")) {
                out.println(diagnostics.toXML());
                // PrintWriter swallows I/O errors; surface them instead of leaving a
                // silently truncated file.
                if (out.checkError()) {
                    throw new java.io.IOException("Failed to write diagnostics to " + args[2]);
                }
            }
        }
    }

    /**
     * A named diagnostic: one score per topic plus an optional score for each top word of
     * each topic.
     */
    public class TopicScores {
        /** Name of the diagnostic, used as the label in reports. */
        public String name;
        /** Topic-level scores, indexed by topic. */
        public double[] scores;
        /** Word-level scores, indexed by {@code [topic][word position]}. */
        public double[][] topicWordScores;
        /** Whether the word-level scores are meaningful and should be reported. */
        public boolean wordScoresDefined;

        /**
         * Creates a score table with every score initialized to zero and word scores
         * marked as undefined.
         *
         * @param name      name of the diagnostic
         * @param numTopics number of topics
         * @param numWords  number of word positions per topic
         */
        public TopicScores(String name, int numTopics, int numWords) {
            this.wordScoresDefined = false;
            this.topicWordScores = new double[numTopics][numWords];
            this.scores = new double[numTopics];
            this.name = name;
        }

        /** Overwrites the score of {@code topic}. */
        public void setTopicScore(int topic, double score) {
            scores[topic] = score;
        }

        /** Adds {@code score} to the current score of {@code topic}. */
        public void addToTopicScore(int topic, double score) {
            scores[topic] += score;
        }

        /**
         * Overwrites the score of the word at {@code wordPosition} in {@code topic} and
         * marks the word scores as defined.
         */
        public void setTopicWordScore(int topic, int wordPosition, double score) {
            double[] row = topicWordScores[topic];
            row[wordPosition] = score;
            wordScoresDefined = true;
        }
    }
}

