/*
 * Decompiled with CFR 0.152.
 */
package edu.stanford.nlp.process;

import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.objectbank.TokenizerFactory;
import edu.stanford.nlp.objectbank.XMLBeginEndIterator;
import edu.stanford.nlp.process.PTBTokenizer;
import edu.stanford.nlp.process.Tokenizer;
import edu.stanford.nlp.process.WhitespaceTokenizer;
import edu.stanford.nlp.process.WordTokenFactory;
import edu.stanford.nlp.util.Function;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StringReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.Charset;
import java.nio.charset.IllegalCharsetNameException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.NoSuchElementException;
import java.util.Set;
import java.util.regex.Pattern;

/*
 * This class specifies class file version 49.0 but uses Java 6 signatures.  Assumed Java 6.
 */
public class DocumentPreprocessor
implements Iterable<List<HasWord>> {
    // Character source for the document. Null until iterator() opens inputPath;
    // the iterators also overwrite and null this field while they run.
    private Reader inputReader = null;
    // Path or URL given to the String constructors; stays null when a Reader was supplied.
    private String inputPath = null;
    // Chooses which iterator iterator() hands out (PlainTextIterator or XMLIterator).
    private DocType docType = DocType.Plain;
    // Builds the tokenizer for each pass; null makes PlainTextIterator use a WhitespaceTokenizer.
    private TokenizerFactory<? extends HasWord> tokenizerFactory = PTBTokenizer.factory();
    // Charset name used when opening inputPath; null means no explicit charset is passed.
    private String encoding = null;
    // Token strings that end a sentence; only consulted when sentenceDelimiter is null.
    private String[] sentenceFinalPuncWords = new String[]{".", "?", "!"};
    // Optional transformation applied to every sentence before it is returned.
    private Function<List<HasWord>, List<HasWord>> escaper = null;
    // When non-null this is the sole sentence boundary token and the follower set is emptied.
    private String sentenceDelimiter = null;
    // When non-null, each token's word is cut at the last match of this string and the
    // leading part is kept. It is interpolated unescaped into a regular expression.
    private String tagDelimiter = null;
    // Handed to XMLBeginEndIterator for XML documents; presumably a regex over element
    // names (".*" would then mean "every element") - confirm against XMLBeginEndIterator.
    private String elementDelimiter = ".*";
    // Tokens that remain attached to the current sentence when they follow a boundary token.
    private final String[] sentenceFinalFollowers = new String[]{")", "]", "\"", "'", "''", "-RRB-", "-RSB-", "-RCB-"};

    /**
     * Creates a preprocessor over an already-open reader, treating the
     * content as {@link DocType#Plain}.
     *
     * @param input the reader to consume; a null value is rejected by the
     *              two-argument constructor this delegates to
     */
    public DocumentPreprocessor(Reader input) {
        this(input, DocType.Plain);
    }

    /**
     * Creates a preprocessor over an already-open reader.
     *
     * @param input the reader to consume; must not be null
     * @param t     how the content should be interpreted; it selects the
     *              iterator returned by {@link #iterator()}
     * @throws IllegalArgumentException if {@code input} is null (a subclass
     *         of the RuntimeException this constructor used to throw, so
     *         existing catch blocks keep working)
     */
    public DocumentPreprocessor(Reader input, DocType t) {
        if (input == null) {
            throw new IllegalArgumentException("Cannot read from null object!");
        }
        this.docType = t;
        this.inputReader = input;
    }

    /**
     * Creates a preprocessor for the document at the given path or URL,
     * treating the content as {@link DocType#Plain}. Nothing is opened until
     * {@link #iterator()} is called.
     *
     * @param docPath file path or URL of the document; a null value is
     *                rejected by the two-argument constructor this delegates to
     */
    public DocumentPreprocessor(String docPath) {
        this(docPath, DocType.Plain);
    }

    /**
     * Creates a preprocessor for the document at the given path or URL.
     * Nothing is opened until {@link #iterator()} is called.
     *
     * @param docPath file path or URL of the document; must not be null
     * @param t       how the content should be interpreted; it selects the
     *                iterator returned by {@link #iterator()}
     * @throws IllegalArgumentException if {@code docPath} is null (a subclass
     *         of the RuntimeException this constructor used to throw, so
     *         existing catch blocks keep working)
     */
    public DocumentPreprocessor(String docPath, DocType t) {
        if (docPath == null) {
            throw new IllegalArgumentException("Cannot open null document path!");
        }
        this.docType = t;
        this.inputPath = docPath;
    }

    /**
     * Sets the charset name used when the document is opened from a path.
     * A syntactically legal name that this JVM does not support is ignored
     * without any report, leaving the previous setting in place.
     *
     * @param encoding the charset name to use
     * @throws IllegalCharsetNameException if the name is not a legal charset
     *         name (raised by {@link Charset#isSupported(String)})
     */
    public void setEncoding(String encoding) throws IllegalCharsetNameException {
        boolean usable = Charset.isSupported(encoding);
        if (!usable) {
            return;
        }
        this.encoding = encoding;
    }

    /**
     * Replaces the set of tokens that terminate a sentence. These are only
     * consulted when no explicit sentence delimiter has been set. The array
     * is stored by reference, not copied.
     *
     * @param sentenceFinalPuncWords the boundary tokens; with null, no
     *        punctuation token is treated as a boundary
     */
    public void setSentenceFinalPuncWords(String[] sentenceFinalPuncWords) {
        this.sentenceFinalPuncWords = sentenceFinalPuncWords;
    }

    /**
     * Sets the factory used to build a tokenizer each time a plain-text
     * iterator is created.
     *
     * @param newTokenizerFactory the factory to use; null makes the iterator
     *        fall back to a {@code WhitespaceTokenizer}
     */
    public void setTokenizerFactory(TokenizerFactory<? extends HasWord> newTokenizerFactory) {
        this.tokenizerFactory = newTokenizerFactory;
    }

    /**
     * Sets a transformation that is applied to every sentence before the
     * iterator hands it out.
     *
     * @param e the function to apply to each sentence; null disables it
     */
    public void setEscaper(Function<List<HasWord>, List<HasWord>> e) {
        this.escaper = e;
    }

    /**
     * Makes a single token the only sentence boundary. When set, the
     * sentence-final punctuation words and the follower tokens are both
     * ignored. If the delimiter consists solely of whitespace, the newline
     * token {@code *NL*} is also treated as a boundary and the tokenizer
     * factory (if any) is given the {@code tokenizeNLs} option.
     *
     * @param s the boundary token, or null to split on the sentence-final
     *          punctuation words instead
     */
    public void setSentenceDelimiter(String s) {
        this.sentenceDelimiter = s;
    }

    /**
     * Enables stripping of a trailing tag from every token: each word is
     * split at the last match of the delimiter and only the part before it
     * is kept. The delimiter is inserted unescaped into a regular
     * expression, so regex metacharacters in it are interpreted as such.
     *
     * @param s the delimiter separating word from tag, or null to leave
     *          tokens untouched
     */
    public void setTagDelimiter(String s) {
        this.tagDelimiter = s;
    }

    /**
     * Sets the value handed to {@code XMLBeginEndIterator} when the document
     * type is XML. The default is {@code ".*"}.
     * NOTE(review): presumably a regular expression matched against element
     * names - confirm against XMLBeginEndIterator.
     *
     * @param s the element selector to pass through
     */
    public void setElementDelimiter(String s) {
        this.elementDelimiter = s;
    }

    /**
     * Returns an iterator over the sentences of the document.
     *
     * <p>If no reader is available yet, the document path is opened first.
     * The document type then selects a plain-text or an XML iterator. When
     * the path cannot be opened the failure is reported on standard error
     * and an empty iterator is returned; the same empty iterator is returned
     * when the document type is neither of the two known values.
     *
     * <p>The returned iterators consume, close and reset the shared reader
     * held by this object as they run.
     *
     * @return an iterator over sentences, never null
     */
    @Override
    public Iterator<List<HasWord>> iterator() {
        try {
            if (this.inputReader == null) {
                this.inputReader = this.getReaderFromPath(this.inputPath);
            }
            if (this.docType == DocType.Plain) {
                return new PlainTextIterator();
            }
            if (this.docType == DocType.XML) {
                return new XMLIterator();
            }
        }
        catch (IOException e) {
            System.err.printf("%s: Could not open path %s\n", this.getClass().getName(), this.inputPath);
            // Report the underlying reason too; the path alone rarely explains the failure.
            System.err.println(e);
        }
        // Fallback: an iterator with no elements.
        return new Iterator<List<HasWord>>(){

            @Override
            public boolean hasNext() {
                return false;
            }

            @Override
            public List<HasWord> next() {
                throw new NoSuchElementException();
            }

            @Override
            public void remove() {
                // Same contract as the plain-text and XML iterators: removal is not supported.
                throw new UnsupportedOperationException();
            }
        };
    }

    /**
     * Opens a reader for the given location. The location is first parsed as
     * a URL; if it is not a well-formed URL it is treated as a local file.
     *
     * <p>The configured encoding is applied to both URL and file content.
     * With no encoding configured, the platform default charset is used.
     *
     * @param path a URL or a file-system path
     * @return a reader positioned at the start of the content
     * @throws IOException if the URL cannot be read, the file does not
     *         exist, or the configured encoding is not usable
     */
    private Reader getReaderFromPath(String path) throws IOException {
        InputStream stream;
        try {
            URL url = new URL(path);
            URLConnection connection = url.openConnection();
            stream = connection.getInputStream();
        }
        catch (MalformedURLException e) {
            // Not a URL, so fall back to the local file system.
            File file = new File(path);
            if (!file.exists()) {
                throw new IOException("Unable to open " + path);
            }
            if (this.encoding == null) {
                return new FileReader(path);
            }
            stream = new FileInputStream(file);
        }
        try {
            Reader decoded = this.encoding == null
                    ? new InputStreamReader(stream)
                    : new InputStreamReader(stream, this.encoding);
            return new BufferedReader(decoded);
        }
        catch (IOException e) {
            // The decoder could not be built: release the byte stream before propagating.
            try {
                stream.close();
            }
            catch (IOException ignored) {
                // The original failure is the one worth reporting.
            }
            throw e;
        }
    }

    /**
     * Command-line driver: splits a document into sentences and prints one
     * sentence per line on standard output, tokens separated by a space.
     * The length of each sentence and the final sentence count go to
     * standard error.
     *
     * <p>Usage: {@code DocumentPreprocessor filename [OPTS]} where OPTS are
     * <ul>
     *   <li>{@code -xml ELEMENT} - read the file as XML, selecting elements
     *       with the given element delimiter</li>
     *   <li>{@code -tag DELIM} - strip a trailing tag after DELIM from tokens</li>
     *   <li>{@code -suppressEscaping} - PTB tokenizer with
     *       {@code ptb3Escaping=false}</li>
     *   <li>{@code -tokenizerOptions OPTS} - PTB tokenizer with the given
     *       options</li>
     *   <li>{@code -noTokenization} - whitespace tokenization, one sentence
     *       per line</li>
     * </ul>
     * Unrecognized options are ignored. The input is read as UTF-8.
     *
     * @param args the file name followed by options
     */
    public static void main(String[] args) {
        if (args.length < 1) {
            System.err.println("usage: DocumentPreprocessor filename [OPTS]");
            System.exit(-1);
        }
        DocumentPreprocessor docPreprocessor = new DocumentPreprocessor(args[0]);
        for (int i = 1; i < args.length; ++i) {
            String flag = args[i];
            boolean hasValue = i + 1 < args.length;
            if (flag.equals("-xml") || flag.equals("-tag")) {
                if (!hasValue) {
                    // Previously this ran off the end of args with an ArrayIndexOutOfBoundsException.
                    System.err.println("DocumentPreprocessor: option " + flag + " requires an argument");
                    System.err.println("usage: DocumentPreprocessor filename [OPTS]");
                    System.exit(-1);
                    return;
                }
                String value = args[++i];
                if (flag.equals("-xml")) {
                    // Switch the type in place so options parsed earlier are not lost, and
                    // route the element name to the element delimiter. The tag delimiter
                    // (which -tag sets) would instead truncate every token word.
                    docPreprocessor.docType = DocType.XML;
                    docPreprocessor.setElementDelimiter(value);
                } else {
                    docPreprocessor.setTagDelimiter(value);
                }
                continue;
            }
            if (flag.equals("-suppressEscaping")) {
                String options = "ptb3Escaping=false";
                docPreprocessor.setTokenizerFactory(PTBTokenizer.factory(new WordTokenFactory(), options));
                continue;
            }
            if (flag.equals("-tokenizerOptions") && hasValue) {
                String options = args[++i];
                docPreprocessor.setTokenizerFactory(PTBTokenizer.factory(new WordTokenFactory(), options));
                continue;
            }
            if (flag.equals("-noTokenization")) {
                docPreprocessor.setTokenizerFactory(null);
                docPreprocessor.setSentenceDelimiter(System.getProperty("line.separator"));
            }
        }
        docPreprocessor.setEncoding("UTF-8");
        int numSents = 0;
        for (List<HasWord> sentence : docPreprocessor) {
            ++numSents;
            System.err.println("Length: " + sentence.size());
            boolean printSpace = false;
            for (HasWord word : sentence) {
                if (printSpace) {
                    System.out.print(" ");
                }
                printSpace = true;
                System.out.print(word.word());
            }
            System.out.println();
        }
        System.err.println("Read in " + numSents + " sentences.");
    }

    /*
     * This class specifies class file version 49.0 but uses Java 6 signatures.  Assumed Java 6.
     */
    /**
     * Iterates over the sentences found inside the text blocks that an
     * {@code XMLBeginEndIterator} extracts from the document. Each block is
     * split into sentences by a fresh {@link PlainTextIterator}, which is
     * pointed at the block by temporarily replacing the enclosing object's
     * reader.
     */
    private class XMLIterator
    implements Iterator<List<HasWord>> {
        private final XMLBeginEndIterator<String> xmlItr;
        private final Reader originalDocReader;
        private PlainTextIterator plainItr;
        private List<HasWord> nextSent;

        /**
         * Wraps the enclosing object's current reader and looks ahead to the
         * first sentence.
         */
        public XMLIterator() {
            this.xmlItr = new XMLBeginEndIterator(DocumentPreprocessor.this.inputReader, DocumentPreprocessor.this.elementDelimiter);
            // Remember the document reader: the shared field is overwritten per block below.
            this.originalDocReader = DocumentPreprocessor.this.inputReader;
            this.primeNext();
        }

        /**
         * Advances to the next sentence, moving on to further XML blocks as
         * the current one runs dry. Leaves {@code nextSent} null and closes
         * the document reader once every block has been consumed.
         */
        private void primeNext() {
            for (;;) {
                boolean blockExhausted = this.plainItr == null || !this.plainItr.hasNext();
                if (blockExhausted) {
                    if (!this.xmlItr.hasNext()) {
                        // Nothing left anywhere: release the document and signal the end.
                        IOUtils.closeIgnoringExceptions(this.originalDocReader);
                        this.nextSent = null;
                        return;
                    }
                    String block = this.xmlItr.next();
                    DocumentPreprocessor.this.inputReader = new BufferedReader(new StringReader(block));
                    this.plainItr = new PlainTextIterator();
                    if (!this.plainItr.hasNext()) {
                        // Block held no sentence; try the following one.
                        this.nextSent = null;
                        continue;
                    }
                }
                this.nextSent = this.plainItr.next();
                if (this.nextSent != null) {
                    return;
                }
            }
        }

        /** @return true while another sentence is available */
        @Override
        public boolean hasNext() {
            return this.nextSent != null;
        }

        /**
         * @return the next sentence
         * @throws NoSuchElementException when the document is exhausted
         */
        @Override
        public List<HasWord> next() {
            List<HasWord> current = this.nextSent;
            if (current == null) {
                throw new NoSuchElementException();
            }
            this.primeNext();
            return current;
        }

        /** Removal is not supported. */
        @Override
        public void remove() {
            throw new UnsupportedOperationException();
        }
    }

    /*
     * This class specifies class file version 49.0 but uses Java 6 signatures.  Assumed Java 6.
     */
    /**
     * Splits the token stream read from the enclosing object's reader into
     * sentences. A sentence ends at a boundary token; any follower tokens
     * that come directly after it (closing brackets, quotes) stay in the
     * same sentence. The first token that is neither a boundary nor a
     * follower is carried over to start the next sentence.
     */
    private class PlainTextIterator
    implements Iterator<List<HasWord>> {
        /** Compiled once per iterator; every token is tested against it. */
        private final Pattern whitespace = Pattern.compile("\\s+");
        private Tokenizer<? extends HasWord> tokenizer;
        private Set<String> sentDelims;
        private Set<String> delimFollowers;
        private Function<String, String> splitTag;
        private List<HasWord> nextSent;
        private List<HasWord> nextSentCarryover;

        /**
         * Reads the enclosing object's settings, builds the tokenizer over
         * its current reader and looks ahead to the first sentence.
         */
        public PlainTextIterator() {
            this.delimFollowers = new HashSet<String>(Arrays.asList(DocumentPreprocessor.this.sentenceFinalFollowers));
            this.nextSentCarryover = new ArrayList<HasWord>();
            this.sentDelims = new HashSet<String>();
            boolean eolIsSignificant = false;
            if (DocumentPreprocessor.this.sentenceDelimiter == null) {
                if (DocumentPreprocessor.this.sentenceFinalPuncWords != null) {
                    this.sentDelims = new HashSet<String>(Arrays.asList(DocumentPreprocessor.this.sentenceFinalPuncWords));
                }
            } else {
                // An explicit delimiter replaces the punctuation words and disables followers.
                this.sentDelims.add(DocumentPreprocessor.this.sentenceDelimiter);
                this.delimFollowers = new HashSet<String>();
                eolIsSignificant = this.whitespace.matcher(DocumentPreprocessor.this.sentenceDelimiter).matches();
                if (eolIsSignificant) {
                    this.sentDelims.add("*NL*");
                }
            }
            if (DocumentPreprocessor.this.tokenizerFactory == null) {
                this.tokenizer = new WhitespaceTokenizer(DocumentPreprocessor.this.inputReader, eolIsSignificant);
            } else {
                if (eolIsSignificant) {
                    // Note: this changes the shared factory, not just this iterator.
                    DocumentPreprocessor.this.tokenizerFactory.setOptions("tokenizeNLs");
                }
                this.tokenizer = DocumentPreprocessor.this.tokenizerFactory.getTokenizer(DocumentPreprocessor.this.inputReader);
            }
            if (DocumentPreprocessor.this.tagDelimiter != null) {
                final String delimiter = DocumentPreprocessor.this.tagDelimiter;
                // Matches only the LAST occurrence of the delimiter in a token. The
                // delimiter is used as-is, so regex metacharacters keep their meaning.
                final Pattern lastDelimiter = Pattern.compile(String.format("%s(?!.*%s)", delimiter, delimiter));
                this.splitTag = new Function<String, String>(){

                    @Override
                    public String apply(String in) {
                        String[] splits = lastDelimiter.split(in);
                        // A token made up solely of the delimiter splits to nothing: keep it whole.
                        return splits.length > 0 ? splits[0] : in;
                    }
                };
            }
            this.primeNext();
        }

        /**
         * Assembles the next sentence from the carried-over token plus newly
         * read tokens. When no tokens remain, closes the enclosing object's
         * reader, sets it to null and marks this iterator as exhausted.
         */
        private void primeNext() {
            this.nextSent = new ArrayList<HasWord>(this.nextSentCarryover);
            this.nextSentCarryover = new ArrayList<HasWord>();
            boolean seenBoundary = false;
            while (this.tokenizer.hasNext()) {
                HasWord token = this.tokenizer.next();
                if (this.splitTag != null) {
                    token.setWord(this.splitTag.apply(token.word()));
                }
                if (this.sentDelims.contains(token.word())) {
                    seenBoundary = true;
                } else if (seenBoundary && !this.delimFollowers.contains(token.word())) {
                    // First token of the following sentence: hold it for the next call.
                    this.nextSentCarryover.add(token);
                    break;
                }
                // Whitespace and newline tokens mark boundaries but are never emitted.
                if (this.whitespace.matcher(token.word()).matches() || token.word().equals("*NL*")) {
                    continue;
                }
                this.nextSent.add(token);
            }
            if (this.nextSent.isEmpty() && this.nextSentCarryover.isEmpty()) {
                IOUtils.closeIgnoringExceptions(DocumentPreprocessor.this.inputReader);
                DocumentPreprocessor.this.inputReader = null;
                this.nextSent = null;
            } else if (DocumentPreprocessor.this.escaper != null) {
                this.nextSent = DocumentPreprocessor.this.escaper.apply(this.nextSent);
            }
        }

        /** @return true while another sentence is available */
        @Override
        public boolean hasNext() {
            return this.nextSent != null;
        }

        /**
         * @return the next sentence
         * @throws NoSuchElementException when the input is exhausted
         */
        @Override
        public List<HasWord> next() {
            if (this.nextSent == null) {
                throw new NoSuchElementException();
            }
            List<HasWord> thisIteration = this.nextSent;
            this.primeNext();
            return thisIteration;
        }

        /** Removal is not supported. */
        @Override
        public void remove() {
            throw new UnsupportedOperationException();
        }
    }

    /*
     * This class specifies class file version 49.0 but uses Java 6 signatures.  Assumed Java 6.
     */
    /**
     * How the input document is interpreted; selects the iterator that
     * {@code iterator()} returns.
     */
    public static enum DocType {
        /** Sentences are split directly from the token stream of the input. */
        Plain,
        /** Text blocks are first extracted from XML elements, then split into sentences. */
        XML;

    }
}

