private FST<IntsRef> affixFST(TreeMap<String,List<Integer>> affixes) throws IOException { IntSequenceOutputs outputs = IntSequenceOutputs.getSingleton(); Builder<IntsRef> builder = new Builder<>(FST.INPUT_TYPE.BYTE4, outputs); IntsRefBuilder scratch = new IntsRefBuilder(); for (Map.Entry<String,List<Integer>> entry : affixes.entrySet()) { Util.toUTF32(entry.getKey(), scratch); List<Integer> entries = entry.getValue(); IntsRef output = new IntsRef(entries.size()); for (Integer c : entries) { output.ints[output.length++] = c; } builder.add(scratch.get(), output); } return builder.finish(); }
/** * Creates a new Dictionary containing the information read from the provided InputStreams to hunspell affix * and dictionary files. * You have to close the provided InputStreams yourself. * * @param affix InputStream for reading the hunspell affix file (won't be closed). * @param dictionaries InputStream for reading the hunspell dictionary files (won't be closed). * @throws IOException Can be thrown while reading from the InputStreams * @throws ParseException Can be thrown if the content of the files does not meet expected formats */ public Dictionary(InputStream affix, List<InputStream> dictionaries, boolean ignoreCase) throws IOException, ParseException { this.ignoreCase = ignoreCase; this.needsInputCleaning = ignoreCase; this.needsOutputCleaning = false; // set if we have an OCONV flagLookup.add(new BytesRef()); // no flags -> ord 0 File aff = File.createTempFile("affix", "aff", tempDir); OutputStream out = new BufferedOutputStream(new FileOutputStream(aff)); InputStream aff1 = null; InputStream aff2 = null; try { // copy contents of affix stream to temp file final byte [] buffer = new byte [1024 * 8]; int len; while ((len = affix.read(buffer)) > 0) { out.write(buffer, 0, len); } out.close(); // pass 1: get encoding aff1 = new BufferedInputStream(new FileInputStream(aff)); String encoding = getDictionaryEncoding(aff1); // pass 2: parse affixes CharsetDecoder decoder = getJavaEncoding(encoding); aff2 = new BufferedInputStream(new FileInputStream(aff)); readAffixFile(aff2, decoder); // read dictionary entries IntSequenceOutputs o = IntSequenceOutputs.getSingleton(); Builder<IntsRef> b = new Builder<>(FST.INPUT_TYPE.BYTE4, o); readDictionaryFiles(dictionaries, decoder, b); words = b.finish(); aliases = null; // no longer needed morphAliases = null; // no longer needed } finally { IOUtils.closeWhileHandlingException(out, aff1, aff2); aff.delete(); } }