/*
 * Decompiled with CFR 0.152.
 */
package io.github.javpower.vectorex.keynote.tfidf;

import io.github.javpower.vectorex.keynote.analysis.ScoredEntity;
import io.github.javpower.vectorex.keynote.analysis.SegMode;
import io.github.javpower.vectorex.keynote.analysis.SegToken;
import io.github.javpower.vectorex.keynote.analysis.TextSegmenter;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UncheckedIOException;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;

/**
 * Ranks the terms of a text by TF-IDF score.
 *
 * <p>The stop-word list ({@code /stop_words.txt}) and the IDF dictionary
 * ({@code /idf_dict.txt}) are classpath resources that are loaded lazily on the
 * first call to {@link #analyze(String, int)} and then cached in static fields
 * shared by every instance. Loading is guarded by a class-level lock, and a
 * cache is published only after its resource has been read completely, so a
 * failed load is retried on the next call instead of leaving a partial cache.
 */
public class TFIDF {
    private static final String STOP_WORDS_RESOURCE = "/stop_words.txt";
    private static final String IDF_DICT_RESOURCE = "/idf_dict.txt";
    /** Separator between the term and its IDF value in a dictionary line. */
    private static final Pattern WHITESPACE = Pattern.compile("\\s+");

    private static Map<String, Double> idfMap;
    private static Set<String> stopWordsSet;
    /** Fallback IDF for terms that are missing from the dictionary. */
    private static double idfMedian;

    /**
     * Scores the terms of {@code content} and returns the best ones.
     *
     * @param content text to analyze; {@code null} or blank text yields an empty list
     * @param topN maximum number of entries to return; zero or a negative value
     *     yields an empty list
     * @return a new mutable list of at most {@code topN} entries, highest score first
     * @throws IllegalStateException if a dictionary resource is missing or the IDF
     *     dictionary contains no usable entry
     * @throws UncheckedIOException if a dictionary resource cannot be read
     */
    public List<ScoredEntity> analyze(String content, int topN) {
        this.initializeResources();
        if (topN <= 0) {
            return new ArrayList<>();
        }
        Map<String, Double> tfMap = this.calculateTermFrequency(content);
        List<ScoredEntity> keywordList = this.calculateTFIDFKeywords(tfMap);
        keywordList.sort((k1, k2) -> Double.compare(k2.getScore(), k1.getScore()));
        if (keywordList.size() <= topN) {
            return keywordList;
        }
        // Copy the slice so the result does not keep the full list reachable.
        return new ArrayList<>(keywordList.subList(0, topN));
    }

    /**
     * Loads the shared dictionaries on first use.
     *
     * <p>Each dictionary is built in a local collection and assigned to its static
     * field only after loading succeeded, so no caller can observe a half-filled
     * cache.
     */
    private void initializeResources() {
        synchronized (TFIDF.class) {
            if (stopWordsSet == null) {
                Set<String> loadedStopWords = new HashSet<>();
                this.loadStopWords(loadedStopWords, this.openResource(STOP_WORDS_RESOURCE));
                stopWordsSet = loadedStopWords;
            }
            if (idfMap == null) {
                Map<String, Double> loadedIdf = new HashMap<>();
                this.loadIDFMap(loadedIdf, this.openResource(IDF_DICT_RESOURCE));
                idfMap = loadedIdf;
            }
        }
    }

    /**
     * Opens a classpath resource relative to this object's runtime class.
     *
     * @throws IllegalStateException if the resource does not exist
     */
    private InputStream openResource(String path) {
        InputStream in = this.getClass().getResourceAsStream(path);
        if (in == null) {
            throw new IllegalStateException("Classpath resource not found: " + path);
        }
        return in;
    }

    /**
     * Computes the relative frequency of every counted term in {@code content}.
     *
     * <p>The text is segmented with a {@link TextSegmenter} in {@link SegMode#INDEX}
     * mode. Stop words and tokens of length 0 or 1 are not counted, neither as
     * terms nor in the total used as the denominator.
     *
     * @return term to frequency (occurrences / counted tokens); empty for
     *     {@code null} or blank content
     */
    private Map<String, Double> calculateTermFrequency(String content) {
        Map<String, Double> tfMap = new HashMap<>();
        if (content == null || content.trim().isEmpty()) {
            return tfMap;
        }
        TextSegmenter segmenter = new TextSegmenter(SegMode.INDEX);
        List<SegToken> tokens = segmenter.process(content);
        Map<String, Integer> freqMap = new HashMap<>();
        int wordSum = 0;
        for (SegToken segToken : tokens) {
            String word = segToken.getWord();
            if (stopWordsSet.contains(word) || word.length() <= 1) {
                continue;
            }
            ++wordSum;
            freqMap.merge(word, 1, Integer::sum);
        }
        // freqMap is non-empty only when wordSum > 0, so the division is safe.
        for (Map.Entry<String, Integer> entry : freqMap.entrySet()) {
            tfMap.put(entry.getKey(), entry.getValue() / (double) wordSum);
        }
        return tfMap;
    }

    /**
     * Multiplies each term frequency by the term's IDF value; terms missing from
     * the dictionary use the dictionary median instead.
     *
     * @return one unsorted entry per term in {@code tfMap}
     */
    private List<ScoredEntity> calculateTFIDFKeywords(Map<String, Double> tfMap) {
        List<ScoredEntity> keywordList = new ArrayList<>(tfMap.size());
        for (Map.Entry<String, Double> entry : tfMap.entrySet()) {
            String word = entry.getKey();
            double tfValue = entry.getValue();
            double idfValue = idfMap.getOrDefault(word, idfMedian);
            keywordList.add(new ScoredEntity<String>(word, tfValue * idfValue));
        }
        return keywordList;
    }

    /**
     * Reads one stop word per line into {@code set}, trimming each line and
     * skipping blank ones. The stream is closed when done.
     *
     * @throws UncheckedIOException if reading fails
     */
    private void loadStopWords(Set<String> set, InputStream in) {
        // Decode explicitly instead of using the platform default charset.
        // NOTE(review): assumes the bundled file is UTF-8 encoded; confirm.
        try (BufferedReader reader =
                new BufferedReader(new InputStreamReader(in, StandardCharsets.UTF_8))) {
            String line;
            while ((line = reader.readLine()) != null) {
                String word = line.trim();
                if (!word.isEmpty()) {
                    set.add(word);
                }
            }
        } catch (IOException e) {
            throw new UncheckedIOException("Failed to read " + STOP_WORDS_RESOURCE, e);
        }
    }

    /**
     * Reads {@code <term> <idf>} lines into {@code map} and sets the fallback IDF
     * to the upper median of the loaded values. Blank lines and lines without a
     * parseable IDF value are skipped. The stream is closed when done.
     *
     * @throws UncheckedIOException if reading fails
     * @throws IllegalStateException if no entry could be loaded
     */
    private void loadIDFMap(Map<String, Double> map, InputStream in) {
        // NOTE(review): assumes the bundled file is UTF-8 encoded; confirm.
        try (BufferedReader reader =
                new BufferedReader(new InputStreamReader(in, StandardCharsets.UTF_8))) {
            String line;
            while ((line = reader.readLine()) != null) {
                String trimmed = line.trim();
                if (trimmed.isEmpty()) {
                    continue;
                }
                String[] kv = WHITESPACE.split(trimmed);
                if (kv.length < 2) {
                    continue;
                }
                try {
                    map.put(kv[0], Double.parseDouble(kv[1]));
                } catch (NumberFormatException ignored) {
                    // One malformed line must not make the whole dictionary unusable.
                }
            }
        } catch (IOException e) {
            throw new UncheckedIOException("Failed to read " + IDF_DICT_RESOURCE, e);
        }
        if (map.isEmpty()) {
            throw new IllegalStateException("IDF dictionary contains no usable entry: " + IDF_DICT_RESOURCE);
        }
        List<Double> idfList = new ArrayList<>(map.values());
        Collections.sort(idfList);
        idfMedian = idfList.get(idfList.size() / 2);
    }
}

