package jsat.text;

import com.itextpdf.text.pdf.PdfObject;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.logging.Level;
import java.util.logging.Logger;
import jsat.DataSet;
import jsat.SimpleDataSet;
import jsat.classifiers.CategoricalData;
import jsat.classifiers.DataPoint;
import jsat.datatransform.RemoveAttributeTransform;
import jsat.linear.SparseVector;
import jsat.linear.Vec;
import jsat.text.tokenizer.Tokenizer;
import jsat.text.wordweighting.WordWeighting;
import jsat.utils.IntList;
import jsat.utils.IntSet;

/* loaded from: input_file:JSAT-0.0.7.jar:jsat/text/TextDataLoader.class */
/**
 * Base class for loaders that convert raw text documents into sparse
 * word-count vectors and, once loading is finished, apply a
 * {@link WordWeighting} to them.
 *
 * <p>Life cycle as implemented here: {@link #getDataSet()} calls
 * {@link #initialLoad()} followed by {@link #finishAdding()} if loading has not
 * been finished yet. Documents are registered through
 * {@link #addOriginalDocument(String)} (presumably from the subclass's
 * {@code initialLoad()} implementation). After {@code finishAdding()} no more
 * documents may be added, and new text can be vectorised through
 * {@link #newText(String)} / {@link #getTextVectorCreator()}.
 *
 * <p>{@code addOriginalDocument} uses per-thread scratch buffers, concurrent
 * maps and a lock around the shared vector list; the remaining methods use no
 * synchronisation of their own.
 */
public abstract class TextDataLoader implements TextVectorCreator {
    private static final long serialVersionUID = -657253682338792871L;

    /** Placeholder stored in {@link #wordIndex} while a new word's real index is being assigned. */
    private static final int INDEX_PENDING = -1;

    /** Upper bound, in milliseconds, for the back-off sleep while waiting on a pending word index. */
    private static final long MAX_BACKOFF_MILLIS = 100;

    /** Splits raw text into tokens. */
    protected Tokenizer tokenizer;
    /** Weighting scheme configured and applied to every vector in {@link #finishAdding()}. */
    private WordWeighting weighting;
    /** Lazily created by {@link #getTextVectorCreator()}. */
    private TextVectorCreator tvc;
    /** Number of documents added so far; only modified while holding the lock on {@link #vectors}. */
    private volatile int documents;
    /** Next unused word index, i.e. the number of distinct words that have been assigned an index. */
    private final AtomicInteger currentLength = new AtomicInteger(0);
    /** One sparse vector per added document, in the order the documents finished being added. */
    protected final List<SparseVector> vectors = new ArrayList<>();
    /** Maps each word to its vector index; holds a negative placeholder while the index is being assigned. */
    protected ConcurrentHashMap<String, Integer> wordIndex = new ConcurrentHashMap<>();
    /** Maps a word index to the number of added documents that contain that word. */
    protected ConcurrentHashMap<Integer, AtomicInteger> termDocumentFrequencys = new ConcurrentHashMap<>();
    /** Index-to-word lookup table, filled lazily by {@link #getWordForIndex(int)}. */
    protected List<String> allWords = new ArrayList<>();
    /** Set to {@code true} by {@link #finishAdding()}; afterwards no documents can be added. */
    protected boolean noMoreAdding = false;
    /** Per-thread scratch buffer handed to the tokenizer; set to {@code null} by {@link #finishAdding()}. */
    protected ThreadLocal<StringBuilder> workSpace = new ThreadLocal<>();
    /** Per-thread token list filled by the tokenizer; set to {@code null} by {@link #finishAdding()}. */
    protected ThreadLocal<List<String>> storageSpace = new ThreadLocal<>();
    /** Per-thread word-to-count map for the current document; set to {@code null} by {@link #finishAdding()}. */
    protected ThreadLocal<Map<String, Integer>> wordCounts = new ThreadLocal<>();

    /**
     * Creates a loader that tokenizes with {@code tokenizer} and weights the
     * resulting vectors with {@code wordWeighting}.
     *
     * @param tokenizer     the tokenizer used to split documents into words
     * @param wordWeighting the weighting applied once loading is finished
     */
    public TextDataLoader(Tokenizer tokenizer, WordWeighting wordWeighting) {
        this.tokenizer = tokenizer;
        this.weighting = wordWeighting;
    }

    /**
     * Performs the initial loading of documents. Invoked by
     * {@link #getDataSet()} immediately before {@link #finishAdding()} when
     * loading has not been finished yet.
     */
    public abstract void initialLoad();

    /**
     * Tokenizes {@code str}, counts each distinct token and stores the counts
     * as a new sparse vector, assigning indices to previously unseen words.
     *
     * <p>Declared public here; the decompiler noted that the original access
     * modifier was {@code protected}.
     *
     * @param str the text of the document to add
     * @return the zero-based position of this document among all added documents
     * @throws RuntimeException if {@link #finishAdding()} has already been called
     */
    public int addOriginalDocument(String str) {
        if (this.noMoreAdding) {
            throw new RuntimeException("Initial data set has been finalized");
        }

        // Fetch (or lazily create) this thread's scratch structures.
        StringBuilder buffer = this.workSpace.get();
        List<String> tokens = this.storageSpace.get();
        Map<String, Integer> counts = this.wordCounts.get();
        if (buffer == null) {
            buffer = new StringBuilder();
            tokens = new ArrayList<>();
            counts = new LinkedHashMap<>();
            this.workSpace.set(buffer);
            this.storageSpace.set(tokens);
            this.wordCounts.set(counts);
        }
        buffer.setLength(0);
        tokens.clear();
        counts.clear();

        this.tokenizer.tokenize(str, buffer, tokens);
        for (String token : tokens) {
            Integer seen = counts.get(token);
            counts.put(token, seen == null ? 1 : seen + 1);
        }

        SparseVector vec = new SparseVector(this.currentLength.get() + 1, counts.size());
        boolean interrupted = false;
        try {
            for (Map.Entry<String, Integer> entry : counts.entrySet()) {
                long backoffMillis = 1;
                // addWord reports false while another thread is still assigning
                // this word's index; back off briefly and retry.
                while (!addWord(entry.getKey(), vec, entry.getValue())) {
                    try {
                        Thread.sleep(backoffMillis);
                        backoffMillis = Math.min(MAX_BACKOFF_MILLIS, backoffMillis * 2);
                    } catch (InterruptedException e) {
                        // The word must still be recorded, so keep retrying, but
                        // remember the interrupt so it can be restored below.
                        interrupted = true;
                    }
                }
            }
        } finally {
            if (interrupted) {
                Thread.currentThread().interrupt();
            }
        }
        counts.clear();

        int documentId;
        synchronized (this.vectors) {
            this.vectors.add(vec);
            documentId = this.documents;
            this.documents = documentId + 1;
        }
        return documentId;
    }

    /**
     * Records one occurrence count of {@code str} in {@code sparseVector} and
     * bumps the word's document frequency, assigning a fresh index if the word
     * has never been seen.
     *
     * @param str          the word to record
     * @param sparseVector the vector of the document being built
     * @param num          how often the word occurs in that document
     * @return {@code false} if another thread is currently assigning the word's
     *         index (the caller should retry), {@code true} once the word has
     *         been recorded
     */
    private boolean addWord(String str, SparseVector sparseVector, Integer num) {
        Integer index = this.wordIndex.get(str);
        if (index == null) {
            // Claim the word with a placeholder first so that only the winning
            // thread consumes a new index from currentLength.
            index = this.wordIndex.putIfAbsent(str, INDEX_PENDING);
            if (index == null) {
                index = this.currentLength.getAndIncrement();
                this.wordIndex.put(str, index);
            }
        }
        if (index < 0) {
            return false;
        }

        incrementDocumentFrequency(index);
        if (sparseVector.length() <= index) {
            // The index may have been handed out after this vector was sized.
            sparseVector.setLength(index + 1);
        }
        sparseVector.set(index, num.intValue());
        return true;
    }

    /**
     * Adds one to the document frequency of the word at {@code index},
     * creating the counter if necessary. The counter always starts at zero so
     * that every document contributes exactly one increment, regardless of
     * which thread ends up installing the counter.
     */
    private void incrementDocumentFrequency(Integer index) {
        AtomicInteger frequency = this.termDocumentFrequencys.get(index);
        if (frequency == null) {
            AtomicInteger fresh = new AtomicInteger(0);
            AtomicInteger raced = this.termDocumentFrequencys.putIfAbsent(index, fresh);
            frequency = (raced != null) ? raced : fresh;
        }
        frequency.incrementAndGet();
    }

    /**
     * Ends the loading phase: forbids further additions, drops the per-thread
     * scratch holders, pads every vector to the final vocabulary length, then
     * configures the weighting from the document frequencies and applies it to
     * each vector.
     *
     * <p>Declared public here; the decompiler noted that the original access
     * modifier was {@code protected}.
     */
    public void finishAdding() {
        this.noMoreAdding = true;
        this.workSpace = null;
        this.storageSpace = null;
        this.wordCounts = null;

        int finalLength = this.currentLength.get();
        int[] documentFrequencies = new int[finalLength];
        for (Map.Entry<Integer, AtomicInteger> entry : this.termDocumentFrequencys.entrySet()) {
            documentFrequencies[entry.getKey()] = entry.getValue().get();
        }
        for (SparseVector vec : this.vectors) {
            vec.setLength(finalLength);
        }

        this.weighting.setWeight(this.vectors, IntList.view(documentFrequencies, finalLength));
        System.out.println("Final Length: " + finalLength);
        for (SparseVector vec : this.vectors) {
            this.weighting.applyTo(vec);
        }
    }

    /**
     * Returns the loaded documents as a data set, running
     * {@link #initialLoad()} and {@link #finishAdding()} first if loading has
     * not been finished yet. Each document becomes a {@link DataPoint} wrapping
     * its vector with empty categorical arrays.
     *
     * @return a new {@link SimpleDataSet} with one data point per document
     */
    public DataSet getDataSet() {
        if (!this.noMoreAdding) {
            initialLoad();
            finishAdding();
        }
        List<DataPoint> points = new ArrayList<>(this.vectors.size());
        for (SparseVector vec : this.vectors) {
            points.add(new DataPoint(vec, new int[0], new CategoricalData[0]));
        }
        return new SimpleDataSet(points);
    }

    /**
     * Vectorises {@code str} by delegating to {@link #getTextVectorCreator()}.
     *
     * @throws RuntimeException if loading has not been finished yet
     */
    @Override // jsat.text.TextVectorCreator
    public Vec newText(String str) {
        if (!this.noMoreAdding) {
            throw new RuntimeException("Initial documents have not yet loaded");
        }
        return getTextVectorCreator().newText(str);
    }

    /**
     * Vectorises {@code str} by delegating to {@link #getTextVectorCreator()},
     * passing the caller-supplied {@code sb} and {@code list} through.
     *
     * @throws RuntimeException if loading has not been finished yet
     */
    @Override // jsat.text.TextVectorCreator
    public Vec newText(String str, StringBuilder sb, List<String> list) {
        if (!this.noMoreAdding) {
            throw new RuntimeException("Initial documents have not yet loaded");
        }
        return getTextVectorCreator().newText(str, sb, list);
    }

    /**
     * Returns the vector creator for new text, creating it on first use from
     * this loader's tokenizer, word index and weighting.
     *
     * @return the (cached) {@link BasicTextVectorCreator}
     * @throws RuntimeException if loading has not been finished yet
     */
    public TextVectorCreator getTextVectorCreator() {
        if (!this.noMoreAdding) {
            throw new RuntimeException("Initial documents have not yet loaded");
        }
        if (this.tvc == null) {
            this.tvc = new BasicTextVectorCreator(this.tokenizer, this.wordIndex, this.weighting);
        }
        return this.tvc;
    }

    /**
     * Looks up the word stored at vector index {@code i}. The reverse lookup
     * table is (re)built from {@link #wordIndex} whenever its size no longer
     * matches the number of known words.
     *
     * @param i the vector index to look up
     * @return the word at that index, or {@code null} if {@code i} is out of range
     */
    public String getWordForIndex(int i) {
        int vocabularySize = this.wordIndex.size();
        if (this.allWords.size() != vocabularySize) {
            // Grow with empty-string placeholders so set(index, ...) below is valid.
            while (this.allWords.size() < vocabularySize) {
                this.allWords.add("");
            }
            for (Map.Entry<String, Integer> entry : this.wordIndex.entrySet()) {
                this.allWords.set(entry.getValue(), entry.getKey());
            }
        }
        if (i < 0 || i >= this.allWords.size()) {
            return null;
        }
        return this.allWords.get(i);
    }

    /**
     * Returns the counter stored in {@link #termDocumentFrequencys} for the
     * given word index, i.e. the number of added documents containing it.
     *
     * @param i the word index
     * @return the recorded document frequency
     * @throws NullPointerException if no counter exists for {@code i}
     */
    public int getTermFrequency(int i) {
        return this.termDocumentFrequencys.get(Integer.valueOf(i)).get();
    }

    /**
     * Builds a {@link RemoveAttributeTransform} from the set of word indices
     * whose document frequency is below {@code i}. That set is passed as the
     * second constructor argument and an empty set as the first (presumably
     * "numeric attributes to remove" and "categorical attributes to remove"
     * respectively; confirm against {@code RemoveAttributeTransform}).
     *
     * @param i the minimum document frequency a word needs in order to be kept
     * @return the transform built from the low-frequency indices
     */
    public RemoveAttributeTransform getMinimumOccurrenceDTF(int i) {
        IntSet rareIndices = new IntSet();
        int known = this.termDocumentFrequencys.size();
        for (int index = 0; index < known; index++) {
            if (this.termDocumentFrequencys.get(Integer.valueOf(index)).get() < i) {
                rareIndices.add(Integer.valueOf(index));
            }
        }
        return new RemoveAttributeTransform(Collections.EMPTY_SET, rareIndices);
    }
}
