瀏覽代碼

同义词插件源码

liwh 1 月之前
當前提交
4eebe58c9a

+ 15 - 0
Dockerfile

@@ -0,0 +1,15 @@
+FROM maven:3.9.2-eclipse-temurin-8-alpine as build
+
+RUN apk update \
+    && apk add zip
+
+WORKDIR /app
+
+COPY . ./
+
+RUN mvn package \
+    && unzip target/releases/elasticsearch-analysis-dynamic-synonym-*.zip -d target/extracted
+
+FROM docker.elastic.co/elasticsearch/elasticsearch:8.7.1
+
+COPY --from=build --chown=elasticsearch:elasticsearch /app/target/extracted /usr/share/elasticsearch/plugins/dynamic-synonym/

+ 18 - 0
Makefile

@@ -0,0 +1,18 @@
+image_host ?=
+
+image_tag ?= 1.0.0
+
+.PHONY: help build_image push_image build_image_and_push
+
+help: ## Display this help message.
+	@echo "Please use \`make <target>\` where <target> is one of"
+	@grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; \
+	{printf "\033[36m%-40s\033[0m %s\n", $$1, $$2}'
+
+build_image:  ## Build docker image. Use `image_host` to override the default image host and `image_tag` to do the same for image tag.
+	docker build -t $(image_host)dynamic-synonym-elasticsearch:$(image_tag) -t $(image_host)dynamic-synonym-elasticsearch:latest .
+
+push_image:  ## Push docker image. Use `image_host` to override the default image host.
+	docker push -a $(image_host)dynamic-synonym-elasticsearch
+
+build_image_and_push: build_image push_image  ## Build and push docker image. Use `image_host` to override the default image host and `image_tag` to do the same for image tag.

+ 79 - 0
README.md

@@ -0,0 +1,79 @@
+# Dynamic Synonym for ElasticSearch
+
+The dynamic synonym plugin adds a synonym token filter that reloads the synonym file (local file or remote file) at given intervals (default 60s).
+
+## Version
+
+| dynamic synonym version | ES version    |
+|-------------------------|---------------|
+| master                  | 8.x -> master |
+| 7.4.2                   | 7.4.2         |
+| 6.1.4                   | 6.1.4         |
+| 5.2.0                   | 5.2.0         |
+| 5.1.1                   | 5.1.1         |
+| 2.3.0                   | 2.3.0         |
+| 2.2.0                   | 2.2.0         |
+| 2.1.0                   | 2.1.0         |
+| 2.0.0                   | 2.0.0         |
+| 1.6.0                   | 1.6.X         |
+
+## Installation
+
+1. `mvn package`
+
+2. copy and unzip `target/releases/elasticsearch-analysis-dynamic-synonym-{version}.zip` to `your-es-root/plugins/dynamic-synonym`
+
+## Example
+
+```json
+{
+    "index" : {
+        "analysis" : {
+            "analyzer" : {
+                "synonym" : {
+                    "tokenizer" : "whitespace",
+                    "filter" : ["remote_synonym"]
+                }
+            },
+            "filter" : {
+                "remote_synonym" : {
+                    "type" : "dynamic_synonym",
+                    "synonyms_path" : "http://host:port/synonym.txt",
+                    "interval": 30
+                },
+                "local_synonym" : {
+                    "type" : "dynamic_synonym",
+                    "synonyms_path" : "synonym.txt"
+                },
+                "synonym_graph" : {
+                    "type" : "dynamic_synonym_graph",
+                    "synonyms_path" : "http://host:port/synonym.txt"
+                }
+            }
+        }
+    }
+}
+```
+### Configuration
+
+`type`: `dynamic_synonym` or `dynamic_synonym_graph`, *mandatory*
+
+`synonyms_path`: A file path relative to the Elastic config file or an URL, *mandatory*
+
+`interval`: Refresh interval in seconds for the synonym file, default: `60`, *optional*
+
+`ignore_case`: Ignore case in synonyms file, default: `false`, *optional*
+
+`expand`: Expand, default: `true`, *optional* 
+
+`lenient`: Lenient on exception thrown when importing a synonym, default: `false`, *optional* 
+
+`format`: Synonym file format, default: `''`, *optional*. For WordNet structure this can be set to `'wordnet'`
+
+
+## Update mechanism
+
+* Local files: Determined by modification time of the file, if it has changed the synonyms wil
+* Remote files: Reads out the `Last-Modified` and `ETag` http header. If one of these changes, the synonyms will be reloaded. 
+
+**Note:** File encoding should be an utf-8 text file. 

+ 175 - 0
pom.xml

@@ -0,0 +1,175 @@
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <groupId>com.bellszhu.elasticsearch</groupId>
+    <artifactId>elasticsearch-analysis-dynamic-synonym</artifactId>
+    <version>7.14.0</version>
+    <packaging>jar</packaging>
+    <name>elasticsearch-dynamic-synonym</name>
+    <description>Analysis-plugin for synonym</description>
+
+    <properties>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+        <elasticsearch.version>${project.version}</elasticsearch.version>
+        <maven.compiler.target>16</maven.compiler.target>
+        <elasticsearch.plugin.name>analysis-dynamic-synonym</elasticsearch.plugin.name>
+        <elasticsearch.assembly.descriptor>${project.basedir}/src/main/assemblies/plugin.xml
+        </elasticsearch.assembly.descriptor>
+        <elasticsearch.plugin.classname>com.bellszhu.elasticsearch.plugin.DynamicSynonymPlugin
+        </elasticsearch.plugin.classname>
+        <elasticsearch.plugin.jvm>true</elasticsearch.plugin.jvm>
+    </properties>
+
+    <licenses>
+        <license>
+            <name>The Apache Software License, Version 2.0</name>
+            <url>http://www.apache.org/licenses/LICENSE-2.0.txt</url>
+            <distribution>repo</distribution>
+        </license>
+    </licenses>
+
+    <parent>
+        <groupId>org.sonatype.oss</groupId>
+        <artifactId>oss-parent</artifactId>
+        <version>9</version>
+    </parent>
+
+    <scm>
+        <connection>scm:git:git@github.com:bells/elasticsearch-analysis-dynamic-synonym.git</connection>
+        <developerConnection>scm:git:git@github.com:bells/elasticsearch-analysis-dynamic-synonym.git
+        </developerConnection>
+        <url>https://github.com/bells/elasticsearch-analysis-dynamic-synonym</url>
+    </scm>
+
+    <repositories>
+        <repository>
+            <id>central</id>
+            <url>https://repo1.maven.org/maven2</url>
+            <releases>
+                <enabled>true</enabled>
+            </releases>
+            <snapshots>
+                <enabled>true</enabled>
+            </snapshots>
+        </repository>
+        <repository>
+            <id>codelibs.org</id>
+            <name>CodeLibs Repository</name>
+            <url>https://maven.codelibs.org/</url>
+        </repository>
+    </repositories>
+
+    <dependencies>
+        <dependency>
+            <groupId>org.elasticsearch</groupId>
+            <artifactId>elasticsearch</artifactId>
+            <version>${elasticsearch.version}</version>
+        </dependency>
+        <dependency>
+            <groupId>org.codelibs.elasticsearch.module</groupId>
+            <artifactId>analysis-common</artifactId>
+            <version>${project.version}</version>
+        </dependency>
+        <dependency>
+            <groupId>junit</groupId>
+            <artifactId>junit</artifactId>
+            <version>4.13.1</version>
+            <scope>test</scope>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.httpcomponents.client5</groupId>
+            <artifactId>httpclient5</artifactId>
+            <version>5.2.1</version>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.logging.log4j</groupId>
+            <artifactId>log4j-core</artifactId>
+            <version>2.17.1</version>
+            <scope>provided</scope>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.logging.log4j</groupId>
+            <artifactId>log4j-api</artifactId>
+            <version>2.20.0</version>
+            <scope>provided</scope>
+        </dependency>
+        <dependency>
+            <groupId>org.codelibs</groupId>
+            <artifactId>elasticsearch-cluster-runner</artifactId>
+            <version>${project.version}.0</version>
+            <scope>test</scope>
+        </dependency>
+    </dependencies>
+
+
+    <build>
+        <plugins>
+            <plugin>
+                <groupId>org.apache.maven.plugins</groupId>
+                <artifactId>maven-compiler-plugin</artifactId>
+                <version>3.11.0</version>
+                <configuration>
+                    <source>${maven.compiler.target}</source>
+                    <target>${maven.compiler.target}</target>
+                </configuration>
+            </plugin>
+            <plugin>
+                <groupId>org.apache.maven.plugins</groupId>
+                <artifactId>maven-surefire-plugin</artifactId>
+                <version>3.1.0</version>
+                <configuration>
+                    <includes>
+                        <include>**/*Tests.java</include>
+                    </includes>
+                </configuration>
+            </plugin>
+            <plugin>
+                <groupId>org.apache.maven.plugins</groupId>
+                <artifactId>maven-enforcer-plugin</artifactId>
+                <version>3.3.0</version>
+            </plugin>
+            <plugin>
+                <groupId>org.apache.maven.plugins</groupId>
+                <artifactId>maven-resources-plugin</artifactId>
+                <version>3.3.1</version>
+            </plugin>
+            <plugin>
+                <groupId>org.apache.maven.plugins</groupId>
+                <artifactId>maven-source-plugin</artifactId>
+                <version>3.3.0</version>
+                <executions>
+                    <execution>
+                        <id>attach-sources</id>
+                        <goals>
+                            <goal>jar</goal>
+                        </goals>
+                    </execution>
+                </executions>
+            </plugin>
+            <plugin>
+                <artifactId>maven-assembly-plugin</artifactId>
+                <configuration>
+                    <appendAssemblyId>false</appendAssemblyId>
+                    <outputDirectory>${project.build.directory}/releases/</outputDirectory>
+                    <descriptors>
+                        <descriptor>${basedir}/src/main/assemblies/plugin.xml</descriptor>
+                    </descriptors>
+                    <archive>
+                        <manifest>
+                            <mainClass>fully.qualified.MainClass</mainClass>
+                        </manifest>
+                    </archive>
+                </configuration>
+                <executions>
+                    <execution>
+                        <phase>package</phase>
+                        <goals>
+                            <goal>single</goal>
+                        </goals>
+                    </execution>
+                </executions>
+            </plugin>
+        </plugins>
+    </build>
+</project>

+ 34 - 0
src/main/assemblies/plugin.xml

@@ -0,0 +1,34 @@
+<?xml version="1.0"?>
+<assembly>
+    <id>-</id>
+    <formats>
+        <format>zip</format>
+    </formats>
+    <includeBaseDirectory>false</includeBaseDirectory>
+    <files>
+        <file>
+            <source>${project.basedir}/src/main/resources/plugin-descriptor.properties</source>
+            <filtered>true</filtered>
+        </file>
+        <file>
+            <source>${project.basedir}/src/main/resources/plugin-security.policy</source>
+            <filtered>true</filtered>
+        </file>
+    </files>
+    <dependencySets>
+        <dependencySet>
+            <useProjectArtifact>true</useProjectArtifact>
+            <useTransitiveFiltering>true</useTransitiveFiltering>
+            <excludes>
+                <exclude>org.elasticsearch:elasticsearch</exclude>
+            </excludes>
+        </dependencySet>
+        <dependencySet>
+            <useProjectArtifact>true</useProjectArtifact>
+            <useTransitiveFiltering>true</useTransitiveFiltering>
+            <includes>
+                <include>org.apache.httpcomponents:httpclient</include>
+            </includes>
+        </dependencySet>
+    </dependencySets>
+</assembly>

+ 29 - 0
src/main/java/com/bellszhu/elasticsearch/plugin/DynamicSynonymPlugin.java

@@ -0,0 +1,29 @@
+package com.bellszhu.elasticsearch.plugin;
+
+import static org.elasticsearch.plugins.AnalysisPlugin.requiresAnalysisSettings;
+
+import java.util.HashMap;
+import java.util.Map;
+
+import org.elasticsearch.index.analysis.TokenFilterFactory;
+import org.elasticsearch.indices.analysis.AnalysisModule.AnalysisProvider;
+import org.elasticsearch.plugins.AnalysisPlugin;
+import org.elasticsearch.plugins.Plugin;
+
+import com.bellszhu.elasticsearch.plugin.synonym.analysis.DynamicSynonymGraphTokenFilterFactory;
+import com.bellszhu.elasticsearch.plugin.synonym.analysis.DynamicSynonymTokenFilterFactory;
+
+
+/**
+ * @author bellszhu
+ */
+public class DynamicSynonymPlugin extends Plugin implements AnalysisPlugin {
+
+    @Override
+    public Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters() {
+        Map<String, AnalysisProvider<TokenFilterFactory>> extra = new HashMap<>();
+        extra.put("dynamic_synonym", requiresAnalysisSettings((indexSettings, env, name, settings) -> new DynamicSynonymTokenFilterFactory(indexSettings,env, name, settings)));
+        extra.put("dynamic_synonym_graph", requiresAnalysisSettings((indexSettings, env, name, settings) -> new DynamicSynonymGraphTokenFilterFactory(indexSettings,env, name, settings)));
+        return extra;
+    }
+}

+ 21 - 0
src/main/java/com/bellszhu/elasticsearch/plugin/synonym/analysis/AbsSynonymFilter.java

@@ -0,0 +1,21 @@
+package com.bellszhu.elasticsearch.plugin.synonym.analysis;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.synonym.SynonymMap;
+
+/**
+ * @author bellszhu
+ */
+public abstract class AbsSynonymFilter extends TokenFilter {
+    /**
+     * Construct a token stream filtering the given input.
+     *
+     * @param input
+     */
+    protected AbsSynonymFilter(TokenStream input) {
+        super(input);
+    }
+
+    abstract void update(SynonymMap synonymMap);
+}

+ 632 - 0
src/main/java/com/bellszhu/elasticsearch/plugin/synonym/analysis/DynamicSynonymFilter.java

@@ -0,0 +1,632 @@
+package com.bellszhu.elasticsearch.plugin.synonym.analysis;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.Arrays;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.synonym.SynonymMap;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+import org.apache.lucene.store.ByteArrayDataInput;
+import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.AttributeSource;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.CharsRef;
+import org.apache.lucene.util.CharsRefBuilder;
+import org.apache.lucene.util.RamUsageEstimator;
+import org.apache.lucene.util.fst.FST;
+
+/**
+ * Matches single or multi word synonyms in a token stream. This token stream
+ * cannot properly handle position increments != 1, ie, you should place this
+ * filter before filtering out stop words.
+ *
+ * <p>
+ * Note that with the current implementation, parsing is greedy, so whenever
+ * multiple parses would apply, the rule starting the earliest and parsing the
+ * most tokens wins. For example if you have these rules:
+ *
+ * <pre>
+ *   a -> x
+ *   a b -> y
+ *   b c d -> z
+ * </pre>
+ * <p>
+ * Then input <code>a b c d e</code> parses to <code>y b c
+ * d</code>, ie the 2nd rule "wins" because it started earliest and matched the
+ * most input tokens of other rules starting at that point.
+ * </p>
+ *
+ * <p>
+ * A future improvement to this filter could allow non-greedy parsing, such that
+ * the 3rd rule would win, and also separately allow multiple parses, such that
+ * all 3 rules would match, perhaps even on a rule by rule basis.
+ * </p>
+ *
+ * <p>
+ * <b>NOTE</b>: when a match occurs, the output tokens associated with the
+ * matching rule are "stacked" on top of the input stream (if the rule had
+ * <code>keepOrig=true</code>) and also on top of another matched rule's output
+ * tokens. This is not a correct solution, as really the output should be an
+ * arbitrary graph/lattice. For example, with the above match, you would expect
+ * an exact <code>PhraseQuery</code> <code>"y b
+ * c"</code> to match the parsed tokens, but it will fail to do so. This
+ * limitation is necessary because Lucene's TokenStream (and index) cannot yet
+ * represent an arbitrary graph.
+ * </p>
+ *
+ * <p>
+ * <b>NOTE</b>: If multiple incoming tokens arrive on the same position, only
+ * the first token at that position is used for parsing. Subsequent tokens
+ * simply pass through and are not parsed. A future improvement would be to
+ * allow these tokens to also be matched.
+ * </p>
+ */
+
+// TODO: maybe we should resolve token -> wordID then run
+// FST on wordIDs, for better perf?
+
+// TODO: a more efficient approach would be Aho/Corasick's
+// algorithm
+// http://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_string_matching_algorithm
+// It improves over the current approach here
+// because it does not fully re-start matching at every
+// token. For example if one pattern is "a b c x"
+// and another is "b c d" and the input is "a b c d", on
+// trying to parse "a b c x" but failing when you got to x,
+// rather than starting over again your really should
+// immediately recognize that "b c d" matches at the next
+// input. I suspect this won't matter that much in
+// practice, but it's possible on some set of synonyms it
+// will. We'd have to modify Aho/Corasick to enforce our
+// conflict resolving (eg greedy matching) because that algo
+// finds all matches. This really amounts to adding a .*
+// closure to the FST and then determinizing it.
+//
+// Another possible solution is described at
+// http://www.cis.uni-muenchen.de/people/Schulz/Pub/dictle5.ps
+
+public final class DynamicSynonymFilter extends AbsSynonymFilter {
+
+    private static final String TYPE_SYNONYM = "SYNONYM";
+    private final boolean ignoreCase;
+    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+    private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
+    private final PositionLengthAttribute posLenAtt = addAttribute(PositionLengthAttribute.class);
+
+    // TODO: we should set PositionLengthAttr too...
+    private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
+    private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
+    private final ByteArrayDataInput bytesReader = new ByteArrayDataInput();
+    private final BytesRef scratchBytes = new BytesRef();
+    private final CharsRefBuilder scratchChars = new CharsRefBuilder();
+    private SynonymMap synonyms;
+    private int rollBufferSize;
+
+    private int captureCount;
+    // How many future input tokens have already been matched
+    // to a synonym; because the matching is "greedy" we don't
+    // try to do any more matching for such tokens:
+    private int inputSkipCount;
+
+    // Rolling buffer, holding pending input tokens we had to
+    // clone because we needed to look ahead, indexed by
+    // position:
+    private PendingInput[] futureInputs;
+    // Rolling buffer, holding stack of pending synonym
+    // outputs, indexed by position:
+    private PendingOutputs[] futureOutputs;
+
+    // Where (in rolling buffers) to write next input saved state:
+    private int nextWrite;
+
+    // Where (in rolling buffers) to read next input saved state:
+    private int nextRead;
+
+    // True once we've read last token
+    private boolean finished;
+
+    private FST.Arc<BytesRef> scratchArc;
+
+    private FST<BytesRef> fst;
+
+    private FST.BytesReader fstReader;
+    /*
+     * This is the core of this TokenFilter: it locates the synonym matches and
+     * buffers up the results into futureInputs/Outputs.
+     *
+     * NOTE: this calls input.incrementToken and does not capture the state if
+     * no further tokens were checked. So caller must then forward state to our
+     * caller, or capture:
+     */
+    private int lastStartOffset;
+    private int lastEndOffset;
+
+    /**
+     * @param input      input tokenstream
+     * @param synonyms   synonym map
+     * @param ignoreCase case-folds input for matching with
+     *                   {@link Character#toLowerCase(int)}. Note, if you set this to
+     *                   true, its your responsibility to lowercase the input entries
+     *                   when you create the {@link SynonymMap}
+     */
+    DynamicSynonymFilter(TokenStream input, SynonymMap synonyms,
+                         boolean ignoreCase) {
+        super(input);
+        this.ignoreCase = ignoreCase;
+        update(synonyms);
+    }
+
+    private void capture() {
+        captureCount++;
+        final PendingInput input = futureInputs[nextWrite];
+
+        input.state = captureState();
+        input.consumed = false;
+        input.term.copyChars(termAtt.buffer(), 0, termAtt.length());
+
+        nextWrite = rollIncr(nextWrite);
+
+        // Buffer head should never catch up to tail:
+        assert nextWrite != nextRead;
+    }
+
+    private void parse() throws IOException {
+
+        assert inputSkipCount == 0;
+
+        int curNextRead = nextRead;
+
+        // Holds the longest match we've seen so far:
+        BytesRef matchOutput = null;
+        int matchInputLength = 0;
+        int matchEndOffset = -1;
+
+        BytesRef pendingOutput = fst.outputs.getNoOutput();
+        fst.getFirstArc(scratchArc);
+
+        assert scratchArc.output() == fst.outputs.getNoOutput();
+
+        int tokenCount = 0;
+
+        byToken:
+        while (true) {
+
+            // Pull next token's chars:
+            final char[] buffer;
+            final int bufferLen;
+
+            int inputEndOffset = 0;
+
+            if (curNextRead == nextWrite) {
+
+                // We used up our lookahead buffer of input tokens
+                // -- pull next real input token:
+                if (finished) {
+                    break;
+                } else {
+                    assert futureInputs[nextWrite].consumed;
+                    // Not correct: a syn match whose output is longer
+                    // than its input can set future inputs keepOrig
+                    // to true:
+                    if (input.incrementToken()) {
+                        buffer = termAtt.buffer();
+                        bufferLen = termAtt.length();
+                        final PendingInput input = futureInputs[nextWrite];
+                        lastStartOffset = input.startOffset = offsetAtt
+                                .startOffset();
+                        lastEndOffset = input.endOffset = offsetAtt.endOffset();
+                        inputEndOffset = input.endOffset;
+                        if (nextRead != nextWrite) {
+                            capture();
+                        } else {
+                            input.consumed = false;
+                        }
+
+                    } else {
+                        // No more input tokens
+                        finished = true;
+                        break;
+                    }
+                }
+            } else {
+                // Still in our lookahead
+                buffer = futureInputs[curNextRead].term.chars();
+                bufferLen = futureInputs[curNextRead].term.length();
+                inputEndOffset = futureInputs[curNextRead].endOffset;
+            }
+
+            tokenCount++;
+
+            // Run each char in this token through the FST:
+            int bufUpto = 0;
+            while (bufUpto < bufferLen) {
+                final int codePoint = Character.codePointAt(buffer, bufUpto,
+                        bufferLen);
+                if (fst.findTargetArc(
+                        ignoreCase ? Character.toLowerCase(codePoint)
+                                : codePoint, scratchArc, scratchArc, fstReader) == null) {
+                    break byToken;
+                }
+
+                // Accum the output
+                pendingOutput = fst.outputs.add(pendingOutput,
+                        scratchArc.output());
+                bufUpto += Character.charCount(codePoint);
+            }
+
+            // OK, entire token matched; now see if this is a final
+            // state:
+            if (scratchArc.isFinal()) {
+                matchOutput = fst.outputs.add(pendingOutput,
+                        scratchArc.nextFinalOutput());
+                matchInputLength = tokenCount;
+                matchEndOffset = inputEndOffset;
+            }
+
+            // See if the FST wants to continue matching (ie, needs to
+            // see the next input token):
+            if (fst.findTargetArc(SynonymMap.WORD_SEPARATOR, scratchArc,
+                    scratchArc, fstReader) == null) {
+                // No further rules can match here; we're done
+                // searching for matching rules starting at the
+                // current input position.
+                break;
+            } else {
+                // More matching is possible -- accum the output (if
+                // any) of the WORD_SEP arc:
+                pendingOutput = fst.outputs.add(pendingOutput,
+                        scratchArc.output());
+                if (nextRead == nextWrite) {
+                    capture();
+                }
+            }
+
+            curNextRead = rollIncr(curNextRead);
+        }
+
+        if (nextRead == nextWrite && !finished) {
+            nextWrite = rollIncr(nextWrite);
+        }
+
+        if (matchOutput != null) {
+            inputSkipCount = matchInputLength;
+            addOutput(matchOutput, matchInputLength, matchEndOffset);
+        } else if (nextRead != nextWrite) {
+            // Even though we had no match here, we set to 1
+            // because we need to skip current input token before
+            // trying to match again:
+            inputSkipCount = 1;
+        } else {
+            assert finished;
+        }
+
+    }
+
+    // Interleaves all output tokens onto the futureOutputs:
+    private void addOutput(BytesRef bytes, int matchInputLength,
+                           int matchEndOffset) {
+        bytesReader.reset(bytes.bytes, bytes.offset, bytes.length);
+
+        final int code = bytesReader.readVInt();
+        final boolean keepOrig = (code & 0x1) == 0;
+        final int count = code >>> 1;
+        for (int outputIDX = 0; outputIDX < count; outputIDX++) {
+            synonyms.words.get(bytesReader.readVInt(), scratchBytes);
+            scratchChars.copyUTF8Bytes(scratchBytes);
+            int lastStart = 0;
+            final int chEnd = lastStart + scratchChars.length();
+            int outputUpto = nextRead;
+            for (int chIDX = lastStart; chIDX <= chEnd; chIDX++) {
+                if (chIDX == chEnd
+                        || scratchChars.charAt(chIDX) == SynonymMap.WORD_SEPARATOR) {
+                    final int outputLen = chIDX - lastStart;
+                    // Caller is not allowed to have empty string in
+                    // the output:
+                    assert outputLen > 0 : "output contains empty string: "
+                            + scratchChars;
+                    final int endOffset;
+                    final int posLen;
+                    if (chIDX == chEnd && lastStart == 0) {
+                        // This rule had a single output token, so, we set
+                        // this output's endOffset to the current
+                        // endOffset (ie, endOffset of the last input
+                        // token it matched):
+                        endOffset = matchEndOffset;
+                        posLen = keepOrig ? matchInputLength : 1;
+                    } else {
+                        // This rule has more than one output token; we
+                        // can't pick any particular endOffset for this
+                        // case, so, we inherit the endOffset for the
+                        // input token which this output overlaps:
+                        endOffset = -1;
+                        posLen = 1;
+                    }
+                    futureOutputs[outputUpto].add(scratchChars.chars(),
+                            lastStart, outputLen, endOffset, posLen);
+                    lastStart = 1 + chIDX;
+                    outputUpto = rollIncr(outputUpto);
+                    assert futureOutputs[outputUpto].posIncr == 1 : "outputUpto="
+                            + outputUpto + " vs nextWrite=" + nextWrite;
+                }
+            }
+        }
+
+        int upto = nextRead;
+        for (int idx = 0; idx < matchInputLength; idx++) {
+            futureInputs[upto].keepOrig |= keepOrig;
+            futureInputs[upto].matched = true;
+            upto = rollIncr(upto);
+        }
+    }
+
+    // ++ mod rollBufferSize
+    private int rollIncr(int count) {
+        count++;
+        if (count == rollBufferSize) {
+            return 0;
+        } else {
+            return count;
+        }
+    }
+
+    @Override
+    public boolean incrementToken() throws IOException {
+
+        while (true) {
+
+            // First play back any buffered future inputs/outputs
+            // w/o running parsing again:
+            while (inputSkipCount != 0) {
+
+                // At each position, we first output the original
+                // token
+
+                // TODO: maybe just a PendingState class, holding
+                // both input & outputs?
+                final PendingInput input = futureInputs[nextRead];
+                final PendingOutputs outputs = futureOutputs[nextRead];
+
+                if (!input.consumed && (input.keepOrig || !input.matched)) {
+                    if (input.state != null) {
+                        // Return a previously saved token (because we
+                        // had to lookahead):
+                        restoreState(input.state);
+                    } else {
+                        // Pass-through case: return token we just pulled
+                        // but didn't capture:
+                        assert inputSkipCount == 1 : "inputSkipCount="
+                                + inputSkipCount + " nextRead=" + nextRead;
+                    }
+                    input.reset();
+                    if (outputs.count > 0) {
+                        outputs.posIncr = 0;
+                    } else {
+                        nextRead = rollIncr(nextRead);
+                        inputSkipCount--;
+                    }
+                    return true;
+                } else if (outputs.upto < outputs.count) {
+                    // Still have pending outputs to replay at this
+                    // position
+                    input.reset();
+                    final int posIncr = outputs.posIncr;
+                    final CharsRef output = outputs.pullNext();
+                    clearAttributes();
+                    termAtt.copyBuffer(output.chars, output.offset,
+                            output.length);
+                    typeAtt.setType(TYPE_SYNONYM);
+                    int endOffset = outputs.getLastEndOffset();
+                    if (endOffset == -1) {
+                        endOffset = input.endOffset;
+                    }
+                    offsetAtt.setOffset(input.startOffset, endOffset);
+                    posIncrAtt.setPositionIncrement(posIncr);
+                    posLenAtt.setPositionLength(outputs.getLastPosLength());
+                    if (outputs.count == 0) {
+                        // Done with the buffered input and all outputs at
+                        // this position
+                        nextRead = rollIncr(nextRead);
+                        inputSkipCount--;
+                    }
+                    return true;
+                } else {
+                    // Done with the buffered input and all outputs at
+                    // this position
+                    input.reset();
+                    nextRead = rollIncr(nextRead);
+                    inputSkipCount--;
+                }
+            }
+
+            if (finished && nextRead == nextWrite) {
+                // End case: if any output syns went beyond end of
+                // input stream, enumerate them now:
+                final PendingOutputs outputs = futureOutputs[nextRead];
+                if (outputs.upto < outputs.count) {
+                    final int posIncr = outputs.posIncr;
+                    final CharsRef output = outputs.pullNext();
+                    futureInputs[nextRead].reset();
+                    if (outputs.count == 0) {
+                        nextWrite = nextRead = rollIncr(nextRead);
+                    }
+                    clearAttributes();
+                    // Keep offset from last input token:
+                    offsetAtt.setOffset(lastStartOffset, lastEndOffset);
+                    termAtt.copyBuffer(output.chars, output.offset,
+                            output.length);
+                    typeAtt.setType(TYPE_SYNONYM);
+                    posIncrAtt.setPositionIncrement(posIncr);
+                    return true;
+                } else {
+                    return false;
+                }
+            }
+
+            // Find new synonym matches:
+            parse();
+        }
+    }
+
+    @Override
+    public void reset() throws IOException {
+
+        super.reset();
+        captureCount = 0;
+        finished = false;
+        inputSkipCount = 0;
+        nextRead = nextWrite = 0;
+
+        // In normal usage these resets would not be needed,
+        // since they reset-as-they-are-consumed, but the app
+        // may not consume all input tokens (or we might hit an
+        // exception), in which case we have leftover state
+        // here:
+        for (PendingInput input : futureInputs) {
+            input.reset();
+        }
+        for (PendingOutputs output : futureOutputs) {
+            output.reset();
+        }
+    }
+
+    void update(SynonymMap synonymMap) {
+        this.synonyms = synonymMap;
+        this.fst = synonyms.fst;
+        if (fst == null) {
+            throw new IllegalArgumentException("fst must be non-null");
+        }
+        this.fstReader = fst.getBytesReader();
+
+        // Must be 1+ so that when roll buffer is at full
+        // lookahead we can distinguish this full buffer from
+        // the empty buffer:
+        rollBufferSize = 1 + synonyms.maxHorizontalContext;
+
+        futureInputs = new PendingInput[rollBufferSize];
+        futureOutputs = new PendingOutputs[rollBufferSize];
+        for (int pos = 0; pos < rollBufferSize; pos++) {
+            futureInputs[pos] = new PendingInput();
+            futureOutputs[pos] = new PendingOutputs();
+        }
+
+        scratchArc = new FST.Arc<>();
+    }
+
+    // Hold all buffered (read ahead) stacked input tokens for
+    // a future position. When multiple tokens are at the
+    // same position, we only store (and match against) the
+    // term for the first token at the position, but capture
+    // state for (and enumerate) all other tokens at this
+    // position:
+    private static class PendingInput {
+        final CharsRefBuilder term = new CharsRefBuilder();
+        AttributeSource.State state;
+        boolean keepOrig;
+        boolean matched;
+        boolean consumed = true;
+        int startOffset;
+        int endOffset;
+
+        void reset() {
+            state = null;
+            consumed = true;
+            keepOrig = false;
+            matched = false;
+        }
+    }
+
+    // Holds pending output synonyms for one future position:
+    private static class PendingOutputs {
+        CharsRefBuilder[] outputs;
+        int[] endOffsets;
+        int[] posLengths;
+        int upto;
+        int count;
+        int posIncr = 1;
+        int lastEndOffset;
+        int lastPosLength;
+
+        PendingOutputs() {
+            outputs = new CharsRefBuilder[1];
+            endOffsets = new int[1];
+            posLengths = new int[1];
+        }
+
+        void reset() {
+            upto = count = 0;
+            posIncr = 1;
+        }
+
+        CharsRef pullNext() {
+            assert upto < count;
+            lastEndOffset = endOffsets[upto];
+            lastPosLength = posLengths[upto];
+            final CharsRefBuilder result = outputs[upto++];
+            posIncr = 0;
+            if (upto == count) {
+                reset();
+            }
+            return result.get();
+        }
+
+        int getLastEndOffset() {
+            return lastEndOffset;
+        }
+
+        int getLastPosLength() {
+            return lastPosLength;
+        }
+
+        void add(char[] output, int offset, int len, int endOffset,
+                 int posLength) {
+            if (count == outputs.length) {
+                outputs = Arrays.copyOf(outputs, ArrayUtil.oversize(1 + count,
+                        RamUsageEstimator.NUM_BYTES_OBJECT_REF));
+            }
+            if (count == endOffsets.length) {
+                final int[] next = new int[ArrayUtil.oversize(1 + count,
+                                                              Integer.BYTES)];
+                System.arraycopy(endOffsets, 0, next, 0, count);
+                endOffsets = next;
+            }
+            if (count == posLengths.length) {
+                final int[] next = new int[ArrayUtil.oversize(1 + count,
+                                                              Integer.BYTES)];
+                System.arraycopy(posLengths, 0, next, 0, count);
+                posLengths = next;
+            }
+            if (outputs[count] == null) {
+                outputs[count] = new CharsRefBuilder();
+            }
+            outputs[count].copyChars(output, offset, len);
+            // endOffset can be -1, in which case we should simply
+            // use the endOffset of the input token, or X >= 0, in
+            // which case we use X as the endOffset for this output
+            endOffsets[count] = endOffset;
+            posLengths[count] = posLength;
+            count++;
+        }
+    }
+
+}

+ 600 - 0
src/main/java/com/bellszhu/elasticsearch/plugin/synonym/analysis/DynamicSynonymGraphFilter.java

@@ -0,0 +1,600 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.bellszhu.elasticsearch.plugin.synonym.analysis;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.LinkedList;
+import java.util.List;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.core.FlattenGraphFilter;
+import org.apache.lucene.analysis.synonym.SynonymFilter;
+import org.apache.lucene.analysis.synonym.SynonymMap;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+import org.apache.lucene.store.ByteArrayDataInput;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.CharsRefBuilder;
+import org.apache.lucene.util.RollingBuffer;
+import org.apache.lucene.util.fst.FST;
+
+// TODO: maybe we should resolve token -> wordID then run
+// FST on wordIDs, for better perf?
+ 
+// TODO: a more efficient approach would be Aho/Corasick's
+// algorithm
+// http://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_string_matching_algorithm
+// It improves over the current approach here
+// because it does not fully re-start matching at every
+// token.  For example if one pattern is "a b c x"
+// and another is "b c d" and the input is "a b c d", on
+// trying to parse "a b c x" but failing when you got to x,
+// rather than starting over again your really should
+// immediately recognize that "b c d" matches at the next
+// input.  I suspect this won't matter that much in
+// practice, but it's possible on some set of synonyms it
+// will.  We'd have to modify Aho/Corasick to enforce our
+// conflict resolving (eg greedy matching) because that algo
+// finds all matches.  This really amounts to adding a .*
+// closure to the FST and then determinizing it.
+//
+// Another possible solution is described at http://www.cis.uni-muenchen.de/people/Schulz/Pub/dictle5.ps
+
+/** Applies single- or multi-token synonyms from a {@link SynonymMap}
+ *  to an incoming {@link TokenStream}, producing a fully correct graph
+ *  output.  This is a replacement for {@link SynonymFilter}, which produces
+ *  incorrect graphs for multi-token synonyms.
+ *
+ *  <p>However, if you use this during indexing, you must follow it with
+ *  {@link FlattenGraphFilter} to squash tokens on top of one another
+ *  like {@link SynonymFilter}, because the indexer can't directly
+ *  consume a graph.  To get fully correct positional queries when your
+ *  synonym replacements are multiple tokens, you should instead apply
+ *  synonyms using this {@code TokenFilter} at query time and translate
+ *  the resulting graph to a {@code TermAutomatonQuery} e.g. using
+ *  {@code TokenStreamToTermAutomatonQuery}.
+ *
+ *  <p><b>NOTE</b>: this cannot consume an incoming graph; results will
+ *  be undefined.
+ *
+ *  @lucene.experimental */
+
+public final class DynamicSynonymGraphFilter extends AbsSynonymFilter {
+
+  public static final String TYPE_SYNONYM = "SYNONYM";
+
+  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+  private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
+  private final PositionLengthAttribute posLenAtt = addAttribute(PositionLengthAttribute.class);
+  private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
+  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
+
+  private SynonymMap synonyms;
+  private final boolean ignoreCase;
+
+  private FST<BytesRef> fst;
+
+  private FST.BytesReader fstReader;
+  private FST.Arc<BytesRef> scratchArc;
+  private final ByteArrayDataInput bytesReader = new ByteArrayDataInput();
+  private final BytesRef scratchBytes = new BytesRef();
+  private final CharsRefBuilder scratchChars = new CharsRefBuilder();
+  private final LinkedList<BufferedOutputToken> outputBuffer = new LinkedList<>();
+
+  private int nextNodeOut;
+  private int lastNodeOut;
+  private int maxLookaheadUsed;
+
+  // For testing:
+  private int captureCount;
+
+  private boolean liveToken;
+
+  // Start/end offset of the current match:
+  private int matchStartOffset;
+  private int matchEndOffset;
+
+  // True once the input TokenStream is exhausted:
+  private boolean finished;
+
+  private int lookaheadNextRead;
+  private int lookaheadNextWrite;
+
+  private RollingBuffer<BufferedInputToken> lookahead = new RollingBuffer<BufferedInputToken>() {
+    @Override
+    protected BufferedInputToken newInstance() {
+      return new BufferedInputToken();
+    }
+  };
+
+  static class BufferedInputToken implements RollingBuffer.Resettable {
+    final CharsRefBuilder term = new CharsRefBuilder();
+    State state;
+    int startOffset = -1;
+    int endOffset = -1;
+
+    @Override
+    public void reset() {
+      state = null;
+      term.clear();
+
+      // Intentionally invalid to ferret out bugs:
+      startOffset = -1;
+      endOffset = -1;
+    }
+  }
+
+  static class BufferedOutputToken {
+    final String term;
+
+    // Non-null if this was an incoming token:
+    final State state;
+
+    final int startNode;
+    final int endNode;
+
+    public BufferedOutputToken(State state, String term, int startNode, int endNode) {
+      this.state = state;
+      this.term = term;
+      this.startNode = startNode;
+      this.endNode = endNode;
+    }
+  }
+
+  /**
+   * Apply previously built synonyms to incoming tokens.
+   * @param input input tokenstream
+   * @param synonyms synonym map
+   * @param ignoreCase case-folds input for matching with {@link Character#toLowerCase(int)}.
+   *                   Note, if you set this to true, it's your responsibility to lowercase
+   *                   the input entries when you create the {@link SynonymMap}
+   */
+  public DynamicSynonymGraphFilter(TokenStream input, SynonymMap synonyms, boolean ignoreCase) {
+    super(input);
+    update(synonyms);
+    this.ignoreCase = ignoreCase;
+  }
+
+  @Override
+  public boolean incrementToken() throws IOException {
+    //System.out.println("\nS: incrToken lastNodeOut=" + lastNodeOut + " nextNodeOut=" + nextNodeOut);
+
+    assert lastNodeOut <= nextNodeOut;
+      
+    if (outputBuffer.isEmpty() == false) {
+      // We still have pending outputs from a prior synonym match:
+      releaseBufferedToken();
+      //System.out.println("  syn: ret buffered=" + this);
+      assert liveToken == false;
+      return true;
+    }
+
+    // Try to parse a new synonym match at the current token:
+
+    if (parse()) {
+      // A new match was found:
+      releaseBufferedToken();
+      //System.out.println("  syn: after parse, ret buffered=" + this);
+      assert liveToken == false;
+      return true;
+    }
+
+    if (lookaheadNextRead == lookaheadNextWrite) {
+
+      // Fast path: parse pulled one token, but it didn't match
+      // the start for any synonym, so we now return it "live" w/o having
+      // cloned all of its atts:
+      if (finished) {
+        //System.out.println("  syn: ret END");
+        return false;
+      }
+
+      assert liveToken;
+      liveToken = false;
+
+      // NOTE: no need to change posInc since it's relative, i.e. whatever
+      // node our output is upto will just increase by the incoming posInc.
+      // We also don't need to change posLen, but only because we cannot
+      // consume a graph, so the incoming token can never span a future
+      // synonym match.
+
+    } else {
+      // We still have buffered lookahead tokens from a previous
+      // parse attempt that required lookahead; just replay them now:
+      //System.out.println("  restore buffer");
+      assert lookaheadNextRead < lookaheadNextWrite: "read=" + lookaheadNextRead + " write=" + lookaheadNextWrite;
+      BufferedInputToken token = lookahead.get(lookaheadNextRead);
+      lookaheadNextRead++;
+
+      restoreState(token.state);
+
+      lookahead.freeBefore(lookaheadNextRead);
+
+      //System.out.println("  after restore offset=" + offsetAtt.startOffset() + "-" + offsetAtt.endOffset());
+      assert liveToken == false;
+    }
+
+    lastNodeOut += posIncrAtt.getPositionIncrement();
+    nextNodeOut = lastNodeOut + posLenAtt.getPositionLength();
+
+    //System.out.println("  syn: ret lookahead=" + this);
+
+    return true;
+  }
+
+  private void releaseBufferedToken() throws IOException {
+    //System.out.println("  releaseBufferedToken");
+
+    BufferedOutputToken token = outputBuffer.pollFirst();
+
+    if (token.state != null) {
+      // This is an original input token (keepOrig=true case):
+      //System.out.println("    hasState");
+      restoreState(token.state);
+      //System.out.println("    startOffset=" + offsetAtt.startOffset() + " endOffset=" + offsetAtt.endOffset());
+    } else {
+      clearAttributes();
+      //System.out.println("    no state");
+      termAtt.append(token.term);
+
+      // We better have a match already:
+      assert matchStartOffset != -1;
+
+      offsetAtt.setOffset(matchStartOffset, matchEndOffset);
+      //System.out.println("    startOffset=" + matchStartOffset + " endOffset=" + matchEndOffset);
+      typeAtt.setType(TYPE_SYNONYM);
+    }
+
+    //System.out.println("    lastNodeOut=" + lastNodeOut);
+    //System.out.println("    term=" + termAtt);
+
+    posIncrAtt.setPositionIncrement(token.startNode - lastNodeOut);
+    lastNodeOut = token.startNode;
+    posLenAtt.setPositionLength(token.endNode - token.startNode);
+  }
+
+  /** Scans the next input token(s) to see if a synonym matches.  Returns true
+   *  if a match was found. */
+  private boolean parse() throws IOException {
+    // System.out.println(Thread.currentThread().getName() + ": S: parse: " + System.identityHashCode(this));
+
+    // Holds the longest match we've seen so far:
+    BytesRef matchOutput = null;
+    int matchInputLength = 0;
+
+    BytesRef pendingOutput = fst.outputs.getNoOutput();
+    fst.getFirstArc(scratchArc);
+
+    assert scratchArc.output() == fst.outputs.getNoOutput();
+
+    // How many tokens in the current match
+    int matchLength = 0;
+    boolean doFinalCapture = false;
+
+    int lookaheadUpto = lookaheadNextRead;
+    matchStartOffset = -1;
+
+    byToken:
+    while (true) {
+      //System.out.println("  cycle lookaheadUpto=" + lookaheadUpto + " maxPos=" + lookahead.getMaxPos());
+      
+      // Pull next token's chars:
+      final char[] buffer;
+      final int bufferLen;
+      final int inputEndOffset;
+
+      if (lookaheadUpto <= lookahead.getMaxPos()) {
+        // Still in our lookahead buffer
+        BufferedInputToken token = lookahead.get(lookaheadUpto);
+        lookaheadUpto++;
+        buffer = token.term.chars();
+        bufferLen = token.term.length();
+        inputEndOffset = token.endOffset;
+        //System.out.println("    use buffer now max=" + lookahead.getMaxPos());
+        if (matchStartOffset == -1) {
+          matchStartOffset = token.startOffset;
+        }
+      } else {
+
+        // We used up our lookahead buffer of input tokens
+        // -- pull next real input token:
+
+        assert finished || liveToken == false;
+
+        if (finished) {
+          //System.out.println("    break: finished");
+          break;
+        } else if (input.incrementToken()) {
+          //System.out.println("    input.incrToken");
+          liveToken = true;
+          buffer = termAtt.buffer();
+          bufferLen = termAtt.length();
+          if (matchStartOffset == -1) {
+            matchStartOffset = offsetAtt.startOffset();
+          }
+          inputEndOffset = offsetAtt.endOffset();
+
+          lookaheadUpto++;
+        } else {
+          // No more input tokens
+          finished = true;
+          //System.out.println("    break: now set finished");
+          break;
+        }
+      }
+
+      matchLength++;
+      //System.out.println("    cycle term=" + new String(buffer, 0, bufferLen));
+
+      // Run each char in this token through the FST:
+      int bufUpto = 0;
+      while (bufUpto < bufferLen) {
+        final int codePoint = Character.codePointAt(buffer, bufUpto, bufferLen);
+        if (fst.findTargetArc(ignoreCase ? Character.toLowerCase(codePoint) : codePoint, scratchArc, scratchArc, fstReader) == null) {
+          break byToken;
+        }
+
+        // Accum the output
+        pendingOutput = fst.outputs.add(pendingOutput, scratchArc.output());
+        bufUpto += Character.charCount(codePoint);
+      }
+
+      assert bufUpto == bufferLen;
+
+      // OK, entire token matched; now see if this is a final
+      // state in the FST (a match):
+      if (scratchArc.isFinal()) {
+        matchOutput = fst.outputs.add(pendingOutput, scratchArc.nextFinalOutput());
+        matchInputLength = matchLength;
+        matchEndOffset = inputEndOffset;
+        //System.out.println("    ** match");
+      }
+
+      // See if the FST can continue matching (ie, needs to
+      // see the next input token):
+      if (fst.findTargetArc(SynonymMap.WORD_SEPARATOR, scratchArc, scratchArc, fstReader) == null) {
+        // No further rules can match here; we're done
+        // searching for matching rules starting at the
+        // current input position.
+        break;
+      } else {
+        // More matching is possible -- accum the output (if
+        // any) of the WORD_SEP arc:
+        pendingOutput = fst.outputs.add(pendingOutput, scratchArc.output());
+        doFinalCapture = true;
+        if (liveToken) {
+          capture();
+        }
+      }
+    }
+
+    if (doFinalCapture && liveToken && finished == false) {
+      // Must capture the final token if we captured any prior tokens:
+      capture();
+    }
+
+    if (matchOutput != null) {
+
+      if (liveToken) {
+        // Single input token synonym; we must buffer it now:
+        capture();
+      }
+
+      // There is a match!
+      bufferOutputTokens(matchOutput, matchInputLength);
+      lookaheadNextRead += matchInputLength;
+      //System.out.println("  precmatch; set lookaheadNextRead=" + lookaheadNextRead + " now max=" + lookahead.getMaxPos());
+      lookahead.freeBefore(lookaheadNextRead);
+      //System.out.println("  match; set lookaheadNextRead=" + lookaheadNextRead + " now max=" + lookahead.getMaxPos());
+      return true;
+    } else {
+      //System.out.println("  no match; lookaheadNextRead=" + lookaheadNextRead);
+      return false;
+    }
+
+    //System.out.println("  parse done inputSkipCount=" + inputSkipCount + " nextRead=" + nextRead + " nextWrite=" + nextWrite);
+  }
+
+  /** Expands the output graph into the necessary tokens, adding
+   *  synonyms as side paths parallel to the input tokens, and
+   *  buffers them in the output token buffer. */
+  private void bufferOutputTokens(BytesRef bytes, int matchInputLength) {
+    bytesReader.reset(bytes.bytes, bytes.offset, bytes.length);
+
+    final int code = bytesReader.readVInt();
+    final boolean keepOrig = (code & 0x1) == 0;
+    //System.out.println("  buffer: keepOrig=" + keepOrig + " matchInputLength=" + matchInputLength);
+
+    // How many nodes along all paths; we need this to assign the
+    // node ID for the final end node where all paths merge back:
+    int totalPathNodes;
+    if (keepOrig) {
+      assert matchInputLength > 0;
+      totalPathNodes = matchInputLength - 1;
+    } else {
+      totalPathNodes = 0;
+    }
+
+    // How many synonyms we will insert over this match:
+    final int count = code >>> 1;
+
+    // TODO: we could encode this instead into the FST:
+
+    // 1st pass: count how many new nodes we need
+    List<List<String>> paths = new ArrayList<>();
+    for(int outputIDX=0;outputIDX<count;outputIDX++) {
+      int wordID = bytesReader.readVInt();
+      synonyms.words.get(wordID, scratchBytes);
+      scratchChars.copyUTF8Bytes(scratchBytes);
+      int lastStart = 0;
+
+      List<String> path = new ArrayList<>();
+      paths.add(path);
+      int chEnd = scratchChars.length();
+      for(int chUpto=0; chUpto<=chEnd; chUpto++) {
+        if (chUpto == chEnd || scratchChars.charAt(chUpto) == SynonymMap.WORD_SEPARATOR) {
+          path.add(new String(scratchChars.chars(), lastStart, chUpto - lastStart));
+          lastStart = 1 + chUpto;
+        }
+      }
+
+      assert path.size() > 0;
+      totalPathNodes += path.size() - 1;
+    }
+    //System.out.println("  totalPathNodes=" + totalPathNodes);
+
+    // 2nd pass: buffer tokens for the graph fragment
+
+    // NOTE: totalPathNodes will be 0 in the case where the matched
+    // input is a single token and all outputs are also a single token
+
+    // We "spawn" a side-path for each of the outputs for this matched
+    // synonym, all ending back at this end node:
+
+    int startNode = nextNodeOut;
+
+    int endNode = startNode + totalPathNodes + 1;
+    //System.out.println("  " + paths.size() + " new side-paths");
+
+    // First, fanout all tokens departing start node for these new side paths:
+    int newNodeCount = 0;
+    for(List<String> path : paths) {
+      int pathEndNode;
+      //System.out.println("    path size=" + path.size());
+      if (path.size() == 1) {
+        // Single token output, so there are no intermediate nodes:
+        pathEndNode = endNode;
+      } else {
+        pathEndNode = nextNodeOut + newNodeCount + 1;
+        newNodeCount += path.size() - 1;
+      }
+      outputBuffer.add(new BufferedOutputToken(null, path.get(0), startNode, pathEndNode));
+    }
+
+    // We must do the original tokens last, else the offsets "go backwards":
+    if (keepOrig) {
+      BufferedInputToken token = lookahead.get(lookaheadNextRead);
+      int inputEndNode;
+      if (matchInputLength == 1) {
+        // Single token matched input, so there are no intermediate nodes:
+        inputEndNode = endNode;
+      } else {
+        inputEndNode = nextNodeOut + newNodeCount + 1;
+      }
+
+      //System.out.println("    keepOrig first token: " + token.term);
+
+      outputBuffer.add(new BufferedOutputToken(token.state, token.term.toString(), startNode, inputEndNode));
+    }
+
+    nextNodeOut = endNode;
+
+    // Do full side-path for each syn output:
+    for(int pathID=0;pathID<paths.size();pathID++) {
+      List<String> path = paths.get(pathID);
+      if (path.size() > 1) {
+        int lastNode = outputBuffer.get(pathID).endNode;
+        for(int i=1;i<path.size()-1;i++) {
+          outputBuffer.add(new BufferedOutputToken(null, path.get(i), lastNode, lastNode+1));
+          lastNode++;
+        }
+        outputBuffer.add(new BufferedOutputToken(null, path.get(path.size()-1), lastNode, endNode));
+      }
+    }
+
+    if (keepOrig && matchInputLength > 1) {
+      // Do full "side path" with the original tokens:
+      int lastNode = outputBuffer.get(paths.size()).endNode;
+      for(int i=1;i<matchInputLength-1;i++) {
+        BufferedInputToken token = lookahead.get(lookaheadNextRead + i);
+        outputBuffer.add(new BufferedOutputToken(token.state, token.term.toString(), lastNode, lastNode+1));
+        lastNode++;
+      }
+      BufferedInputToken token = lookahead.get(lookaheadNextRead + matchInputLength - 1);
+      outputBuffer.add(new BufferedOutputToken(token.state, token.term.toString(), lastNode, endNode));
+    }
+
+    /*
+    System.out.println("  after buffer: " + outputBuffer.size() + " tokens:");
+    for(BufferedOutputToken token : outputBuffer) {
+      System.out.println("    tok: " + token.term + " startNode=" + token.startNode + " endNode=" + token.endNode);
+    }
+    */
+  }
+
+  /** Buffers the current input token into lookahead buffer. */
+  private void capture() {
+    assert liveToken;
+    liveToken = false;
+    BufferedInputToken token = lookahead.get(lookaheadNextWrite);
+    lookaheadNextWrite++;
+
+    token.state = captureState();
+    token.startOffset = offsetAtt.startOffset();
+    token.endOffset = offsetAtt.endOffset();
+    assert token.term.length() == 0;
+    token.term.append(termAtt);
+
+    captureCount++;
+    maxLookaheadUsed = Math.max(maxLookaheadUsed, lookahead.getBufferSize());
+    //System.out.println("  maxLookaheadUsed=" + maxLookaheadUsed);
+  }
+
+  @Override
+  public void reset() throws IOException {
+    super.reset();
+    lookahead.reset();
+    lookaheadNextWrite = 0;
+    lookaheadNextRead = 0;
+    captureCount = 0;
+    lastNodeOut = -1;
+    nextNodeOut = 0;
+    matchStartOffset = -1;
+    matchEndOffset = -1;
+    finished = false;
+    liveToken = false;
+    outputBuffer.clear();
+    maxLookaheadUsed = 0;
+    //System.out.println("S: reset");
+  }
+
+  void update(SynonymMap synonymMap) {
+    this.synonyms = synonymMap;
+    this.fst = synonyms.fst;
+    if (fst == null) {
+      throw new IllegalArgumentException("fst must be non-null");
+    }
+    this.fstReader = fst.getBytesReader();
+    scratchArc = new FST.Arc<>();
+
+  }
+
+  // for testing
+  int getCaptureCount() {
+    return captureCount;
+  }
+
+  // for testing
+  int getMaxLookaheadUsed() {
+    return maxLookaheadUsed;
+  }
+}

+ 66 - 0
src/main/java/com/bellszhu/elasticsearch/plugin/synonym/analysis/DynamicSynonymGraphTokenFilterFactory.java

@@ -0,0 +1,66 @@
+package com.bellszhu.elasticsearch.plugin.synonym.analysis;
+
+import java.io.IOException;
+import java.util.List;
+import java.util.function.Function;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.env.Environment;
+import org.elasticsearch.index.IndexSettings;
+import org.elasticsearch.index.analysis.AnalysisMode;
+import org.elasticsearch.index.analysis.CharFilterFactory;
+import org.elasticsearch.index.analysis.TokenFilterFactory;
+import org.elasticsearch.index.analysis.TokenizerFactory;
+
+public class DynamicSynonymGraphTokenFilterFactory extends DynamicSynonymTokenFilterFactory {
+
+    public DynamicSynonymGraphTokenFilterFactory(
+            IndexSettings indexSettings, Environment env, String name, Settings settings
+    ) throws IOException {
+        super(indexSettings,env, name, settings);
+    }
+
+    @Override
+    public TokenStream create(TokenStream tokenStream) {
+        throw new IllegalStateException(
+                "Call createPerAnalyzerSynonymGraphFactory to specialize this factory for an analysis chain first"
+        );
+    }
+
+    @Override
+    public TokenFilterFactory getChainAwareTokenFilterFactory(
+            TokenizerFactory tokenizer, List<CharFilterFactory> charFilters,
+            List<TokenFilterFactory> previousTokenFilters,
+            Function<String, TokenFilterFactory> allFilters
+    ) {
+        final Analyzer analyzer = buildSynonymAnalyzer(tokenizer, charFilters, previousTokenFilters);
+        synonymMap = buildSynonyms(analyzer);
+        final String name = name();
+        return new TokenFilterFactory() {
+            @Override
+            public String name() {
+                return name;
+            }
+
+            @Override
+            public TokenStream create(TokenStream tokenStream) {
+                // fst is null means no synonyms
+                if (synonymMap.fst == null) {
+                    return tokenStream;
+                }
+                DynamicSynonymGraphFilter dynamicSynonymGraphFilter = new DynamicSynonymGraphFilter(
+                        tokenStream, synonymMap, false);
+                dynamicSynonymFilters.put(dynamicSynonymGraphFilter, 1);
+
+                return dynamicSynonymGraphFilter;
+            }
+
+            @Override
+            public AnalysisMode getAnalysisMode() {
+                return analysisMode;
+            }
+        };
+    }
+}

+ 207 - 0
src/main/java/com/bellszhu/elasticsearch/plugin/synonym/analysis/DynamicSynonymTokenFilterFactory.java

@@ -0,0 +1,207 @@
+package com.bellszhu.elasticsearch.plugin.synonym.analysis;
+
+
+import java.io.IOException;
+import java.util.List;
+import java.util.Map;
+import java.util.WeakHashMap;
+import java.util.concurrent.Executors;
+import java.util.concurrent.ScheduledExecutorService;
+import java.util.concurrent.ScheduledFuture;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicInteger;
+import java.util.function.Function;
+
+import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.Logger;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.synonym.SynonymMap;
+import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.env.Environment;
+import org.elasticsearch.index.IndexSettings;
+import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
+import org.elasticsearch.index.analysis.AnalysisMode;
+import org.elasticsearch.index.analysis.CharFilterFactory;
+import org.elasticsearch.index.analysis.CustomAnalyzer;
+import org.elasticsearch.index.analysis.TokenFilterFactory;
+import org.elasticsearch.index.analysis.TokenizerFactory;
+
+/**
+ * @author bellszhu
+ */
+public class DynamicSynonymTokenFilterFactory extends
+        AbstractTokenFilterFactory {
+
+    private static final Logger logger = LogManager.getLogger("dynamic-synonym");
+
+    /**
+     * Static id generator
+     */
+    private static final AtomicInteger id = new AtomicInteger(1);
+    private static final ScheduledExecutorService pool = Executors.newScheduledThreadPool(1, r -> {
+        Thread thread = new Thread(r);
+        thread.setName("monitor-synonym-Thread-" + id.getAndAdd(1));
+        return thread;
+    });
+    private volatile ScheduledFuture<?> scheduledFuture;
+
+    private final String location;
+    private final boolean expand;
+    private final boolean lenient;
+    private final String format;
+    private final int interval;
+    protected SynonymMap synonymMap;
+    protected Map<AbsSynonymFilter, Integer> dynamicSynonymFilters = new WeakHashMap<>();
+    protected final Environment environment;
+    protected final AnalysisMode analysisMode;
+
+    public DynamicSynonymTokenFilterFactory(
+            IndexSettings indexSettings,
+            Environment env,
+            String name,
+            Settings settings
+    ) throws IOException {
+        super(indexSettings,name, settings);
+
+        this.location = settings.get("synonyms_path");
+        if (this.location == null) {
+            throw new IllegalArgumentException(
+                    "dynamic synonym requires `synonyms_path` to be configured");
+        }
+        if (settings.get("ignore_case") != null) {
+        }
+
+        this.interval = settings.getAsInt("interval", 60);
+        this.expand = settings.getAsBoolean("expand", true);
+        this.lenient = settings.getAsBoolean("lenient", false);
+        this.format = settings.get("format", "");
+        boolean updateable = settings.getAsBoolean("updateable", false);
+        this.analysisMode = updateable ? AnalysisMode.SEARCH_TIME : AnalysisMode.ALL;
+        this.environment = env;
+    }
+
+    @Override
+    public AnalysisMode getAnalysisMode() {
+        return this.analysisMode;
+    }
+
+
+    @Override
+    public TokenStream create(TokenStream tokenStream) {
+        throw new IllegalStateException(
+                "Call getChainAwareTokenFilterFactory to specialize this factory for an analysis chain first");
+    }
+
+    public TokenFilterFactory getChainAwareTokenFilterFactory(
+            TokenizerFactory tokenizer,
+            List<CharFilterFactory> charFilters,
+            List<TokenFilterFactory> previousTokenFilters,
+            Function<String, TokenFilterFactory> allFilters
+    ) {
+        final Analyzer analyzer = buildSynonymAnalyzer(tokenizer, charFilters, previousTokenFilters);
+        synonymMap = buildSynonyms(analyzer);
+        final String name = name();
+        return new TokenFilterFactory() {
+            @Override
+            public String name() {
+                return name;
+            }
+
+            @Override
+            public TokenStream create(TokenStream tokenStream) {
+                // fst is null means no synonyms
+                if (synonymMap.fst == null) {
+                    return tokenStream;
+                }
+                DynamicSynonymFilter dynamicSynonymFilter = new DynamicSynonymFilter(tokenStream, synonymMap, false);
+                dynamicSynonymFilters.put(dynamicSynonymFilter, 1);
+
+                return dynamicSynonymFilter;
+            }
+
+            @Override
+            public TokenFilterFactory getSynonymFilter() {
+                // In order to allow chained synonym filters, we return IDENTITY here to
+                // ensure that synonyms don't get applied to the synonym map itself,
+                // which doesn't support stacked input tokens
+                return IDENTITY_FILTER;
+            }
+
+            @Override
+            public AnalysisMode getAnalysisMode() {
+                return analysisMode;
+            }
+        };
+    }
+
+    Analyzer buildSynonymAnalyzer(
+            TokenizerFactory tokenizer,
+            List<CharFilterFactory> charFilters,
+            List<TokenFilterFactory> tokenFilters
+    ) {
+        return new CustomAnalyzer(
+                tokenizer,
+                charFilters.toArray(new CharFilterFactory[0]),
+                tokenFilters.stream().map(TokenFilterFactory::getSynonymFilter).toArray(TokenFilterFactory[]::new)
+        );
+    }
+
+    SynonymMap buildSynonyms(Analyzer analyzer) {
+        try {
+            return getSynonymFile(analyzer).reloadSynonymMap();
+        } catch (Exception e) {
+            logger.error("failed to build synonyms", e);
+            throw new IllegalArgumentException("failed to build synonyms", e);
+        }
+    }
+
+    SynonymFile getSynonymFile(Analyzer analyzer) {
+        try {
+            SynonymFile synonymFile;
+            if (location.startsWith("http://") || location.startsWith("https://")) {
+                synonymFile = new RemoteSynonymFile(
+                        environment, analyzer, expand, lenient,  format, location);
+            } else {
+                synonymFile = new LocalSynonymFile(
+                        environment, analyzer, expand, lenient, format, location);
+            }
+            if (scheduledFuture == null) {
+                scheduledFuture = pool.scheduleAtFixedRate(new Monitor(synonymFile),
+                                interval, interval, TimeUnit.SECONDS);
+            }
+            return synonymFile;
+        } catch (Exception e) {
+            logger.error("failed to get synonyms: " + location, e);
+            throw new IllegalArgumentException("failed to get synonyms : " + location, e);
+        }
+    }
+
+    public class Monitor implements Runnable {
+
+        private SynonymFile synonymFile;
+
+        Monitor(SynonymFile synonymFile) {
+            this.synonymFile = synonymFile;
+        }
+
+        @Override
+        public void run() {
+            try {
+                logger.info("===== Monitor =======");
+                if (synonymFile.isNeedReloadSynonymMap()) {
+                    synonymMap = synonymFile.reloadSynonymMap();
+                    for (AbsSynonymFilter dynamicSynonymFilter : dynamicSynonymFilters.keySet()) {
+                        dynamicSynonymFilter.update(synonymMap);
+                        logger.debug("success reload synonym");
+                    }
+                }
+            } catch (Exception e) {
+                logger.info("Monitor error", e);
+//                e.printStackTrace();
+                logger.error(e);
+            }
+        }
+    }
+
+}

+ 164 - 0
src/main/java/com/bellszhu/elasticsearch/plugin/synonym/analysis/LocalSynonymFile.java

@@ -0,0 +1,164 @@
+/**
+ *
+ */
+package com.bellszhu.elasticsearch.plugin.synonym.analysis;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.io.Reader;
+import java.io.StringReader;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Path;
+
+import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.Logger;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.synonym.SynonymMap;
+import org.elasticsearch.env.Environment;
+
+
+/**
+ * @author bellszhu
+ */
+public class LocalSynonymFile implements SynonymFile {
+
+    private static final Logger logger = LogManager.getLogger("dynamic-synonym");
+
+    private String format;
+
+    private boolean expand;
+
+    private boolean lenient;
+
+    private Analyzer analyzer;
+
+    private Environment env;
+
+    /**
+     * Local file path relative to the config directory
+     */
+    private String location;
+
+    private Path synonymFilePath;
+
+    private long lastModified;
+
+    LocalSynonymFile(Environment env, Analyzer analyzer, boolean expand, boolean lenient,
+                     String format, String location) {
+        this.analyzer = analyzer;
+        this.expand = expand;
+        this.lenient = lenient;
+        this.format = format;
+        this.env = env;
+        this.location = location;
+
+        this.synonymFilePath = deepSearch();
+        isNeedReloadSynonymMap();
+    }
+
+    @Override
+    public SynonymMap reloadSynonymMap() {
+        try {
+            logger.debug("start reload local synonym from {}.", synonymFilePath);
+            Reader rulesReader = getReader();
+            SynonymMap.Builder parser = RemoteSynonymFile.getSynonymParser(
+                    rulesReader, format, expand, lenient, analyzer);
+            return parser.build();
+        } catch (Exception e) {
+            logger.error("reload local synonym {} error!", synonymFilePath, e);
+            throw new IllegalArgumentException(
+                    "could not reload local synonyms file to build synonyms", e);
+        }
+
+    }
+
+    /*
+    Just deleted when reading the file, Returns empty synonym
+      keyword if file not exists.
+    A small probability event.
+    */
+    public Reader getReader() {
+        if (!Files.exists(synonymFilePath)) {
+            return new StringReader("");
+        }
+        try (BufferedReader br = new BufferedReader(new InputStreamReader(
+                synonymFilePath.toUri().toURL().openStream(), StandardCharsets.UTF_8))) {
+            StringBuilder sb = new StringBuilder();
+            String line;
+            while ((line = br.readLine()) != null) {
+                // logger.info("reload local synonym: {}", line);
+                sb.append(line).append(System.getProperty("line.separator"));
+            }
+            return new StringReader(sb.toString());
+        } catch (IOException e) {
+            logger.error("get local synonym reader {} error!", location, e);
+//            throw new IllegalArgumentException(
+//                    "IOException while reading local synonyms file", e);
+//            Fix #54 Returns blank if synonym file has be deleted.
+            return new StringReader("");
+        }
+    }
+
+    @Override
+    public boolean isNeedReloadSynonymMap() {
+        try {
+            /*
+            If the file does not exist, it will be scanned every time
+              until the file is restored.
+             */
+            if (!Files.exists(synonymFilePath) && !Files.exists(synonymFilePath = deepSearch())) {
+                return false;
+            }
+            File synonymFile = synonymFilePath.toFile();
+            if (synonymFile.exists()
+                    && lastModified < synonymFile.lastModified()) {
+                lastModified = synonymFile.lastModified();
+                return true;
+            }
+        } catch (Exception e) {
+            logger.error("check need reload local synonym {} error!", location, e);
+        }
+
+        return false;
+    }
+
+    /**
+     * Deep search synonym file.
+     * Step 1. Query the 'sysnonym_path' parameter as an absolute path
+     * Step 2. Query the es config path
+     * Step 3. Query in current relative path
+     * <p>
+     * Override this method to expend search path
+     *
+     * @return the synonym path.
+     */
+    protected Path deepSearch() {
+        return env.configFile().resolve(location);
+//        // TODO
+//        SpecialPermission.check();
+//        return AccessController.doPrivileged((PrivilegedAction<Path>) () -> {
+//            return env.configFile().resolve(location);
+////            // access denied:java.io.FilePermission
+////            Path path;
+////            // Load setting config as absolute path
+////            if (Files.exists(Paths.get(location))) { // access denied:java.io.FilePermission
+////                path = Paths.get(location);
+////                // Load from setting config path
+////            } else if (Files.exists(env.configFile().resolve(location))) {
+////                path = env.configFile().resolve(location);
+////                // Load from current relative path
+////            } else {
+////                URL url = getClass().getClassLoader().getResource(location);
+////                if (url != null) {
+////                    path = Paths.get(url.getFile());
+////                } else {
+////                    path = env.configFile().resolve(location);
+////                }
+////            }
+////            return path;
+//        });
+    }
+}

+ 240 - 0
src/main/java/com/bellszhu/elasticsearch/plugin/synonym/analysis/RemoteSynonymFile.java

@@ -0,0 +1,240 @@
+/**
+ *
+ */
+package com.bellszhu.elasticsearch.plugin.synonym.analysis;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.io.Reader;
+import java.io.StringReader;
+import java.text.ParseException;
+import java.util.concurrent.TimeUnit;
+
+import org.apache.hc.client5.http.classic.methods.HttpGet;
+import org.apache.hc.client5.http.classic.methods.HttpHead;
+import org.apache.hc.client5.http.classic.methods.HttpUriRequest;
+import org.apache.hc.client5.http.config.RequestConfig;
+import org.apache.hc.client5.http.impl.classic.CloseableHttpClient;
+import org.apache.hc.client5.http.impl.classic.CloseableHttpResponse;
+import org.apache.hc.client5.http.impl.classic.HttpClients;
+import org.apache.hc.core5.http.message.StatusLine;
+import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.Logger;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.synonym.SynonymMap;
+import org.elasticsearch.analysis.common.ESSolrSynonymParser;
+import org.elasticsearch.analysis.common.ESWordnetSynonymParser;
+import org.elasticsearch.env.Environment;
+
+/**
+ * @author bellszhu
+ */
+public class RemoteSynonymFile implements SynonymFile {
+
+    private static final String LAST_MODIFIED_HEADER = "Last-Modified";
+    private static final String ETAG_HEADER = "ETag";
+
+    private static final Logger logger = LogManager.getLogger("dynamic-synonym");
+
+    private CloseableHttpClient httpclient;
+
+    private String format;
+
+    private boolean expand;
+
+    private boolean lenient;
+
+    private Analyzer analyzer;
+
+    private Environment env;
+
+    /**
+     * Remote URL address
+     */
+    private String location;
+
+    private String lastModified;
+
+    private String eTags;
+
+    RemoteSynonymFile(Environment env, Analyzer analyzer,
+                      boolean expand, boolean lenient, String format, String location) {
+        this.analyzer = analyzer;
+        this.expand = expand;
+        this.lenient = lenient;
+        this.format = format;
+        this.env = env;
+        this.location = location;
+
+        this.httpclient = HttpClients.createDefault();
+
+        isNeedReloadSynonymMap();
+    }
+
+    static SynonymMap.Builder getSynonymParser(
+            Reader rulesReader, String format, boolean expand, boolean lenient, Analyzer analyzer
+    ) throws IOException, ParseException {
+        SynonymMap.Builder parser;
+        if ("wordnet".equalsIgnoreCase(format)) {
+            parser = new ESWordnetSynonymParser(true, expand, lenient, analyzer);
+            ((ESWordnetSynonymParser) parser).parse(rulesReader);
+        } else {
+            parser = new ESSolrSynonymParser(true, expand, lenient, analyzer);
+            ((ESSolrSynonymParser) parser).parse(rulesReader);
+        }
+        return parser;
+    }
+
+    @Override
+    public SynonymMap reloadSynonymMap() {
+        Reader rulesReader = null;
+        try {
+            logger.debug("start reload remote synonym from {}.", location);
+            rulesReader = getReader();
+            SynonymMap.Builder parser;
+
+            parser = getSynonymParser(rulesReader, format, expand, lenient, analyzer);
+            return parser.build();
+        } catch (Exception e) {
+            logger.error("reload remote synonym {} error!", location, e);
+            throw new IllegalArgumentException(
+                    "could not reload remote synonyms file to build synonyms",
+                    e);
+        } finally {
+            if (rulesReader != null) {
+                try {
+                    rulesReader.close();
+                } catch (Exception e) {
+                    logger.error("failed to close rulesReader", e);
+                }
+            }
+        }
+    }
+
+    private CloseableHttpResponse executeHttpRequest(HttpUriRequest httpUriRequest) {
+        try {
+            return httpclient.execute(httpUriRequest);
+        } catch (IOException e) {
+            logger.error("Unable to execute HTTP request.", e);
+        }
+        return null;
+    }
+
+    /**
+     * Download custom terms from a remote server
+     */
+    public Reader getReader() {
+        Reader reader;
+        RequestConfig rc = RequestConfig.custom()
+                .setConnectionRequestTimeout(10 * 1000, TimeUnit.MILLISECONDS)
+                .setResponseTimeout(60 * 1000, TimeUnit.MILLISECONDS)
+                .build();
+        CloseableHttpResponse response = null;
+        BufferedReader br = null;
+        HttpGet get = new HttpGet(location);
+        get.setConfig(rc);
+        try {
+            response = executeHttpRequest(get);
+            assert response != null;
+            StatusLine statusLine = new StatusLine(response);
+            if (statusLine.getStatusCode() == 200) {
+                String charset = "UTF-8"; // 获取编码,默认为utf-8
+                if (response.getEntity().getContentType().contains("charset=")) {
+                    String contentType = response.getEntity().getContentType();
+                    charset = contentType.substring(contentType
+                            .lastIndexOf('=') + 1);
+                }
+
+                br = new BufferedReader(new InputStreamReader(response
+                        .getEntity().getContent(), charset));
+                StringBuilder sb = new StringBuilder();
+                String line;
+                while ((line = br.readLine()) != null) {
+                    logger.debug("reload remote synonym: {}", line);
+                    sb.append(line)
+                            .append(System.getProperty("line.separator"));
+                }
+                reader = new StringReader(sb.toString());
+            } else reader = new StringReader("");
+        } catch (Exception e) {
+            logger.error("get remote synonym reader {} error!", location, e);
+//            throw new IllegalArgumentException(
+//                    "Exception while reading remote synonyms file", e);
+            // Fix #54 Returns blank if synonym file has be deleted.
+            reader = new StringReader("1=>1");
+        } finally {
+            try {
+                if (br != null) {
+                    br.close();
+                }
+            } catch (IOException e) {
+                logger.error("failed to close bufferedReader", e);
+            }
+            try {
+                if (response != null) {
+                    response.close();
+                }
+            } catch (IOException e) {
+                logger.error("failed to close http response", e);
+            }
+        }
+        return reader;
+    }
+
+    @Override
+    public boolean isNeedReloadSynonymMap() {
+        logger.info("==== isNeedReloadSynonymMap ====");
+        RequestConfig rc = RequestConfig.custom()
+                .setConnectionRequestTimeout(10 * 1000, TimeUnit.MILLISECONDS)
+                .setResponseTimeout(15 * 1000, TimeUnit.MILLISECONDS)
+                .build();
+        HttpHead head = new HttpHead(location);
+        head.setConfig(rc);
+
+        // 设置请求头
+        if (lastModified != null) {
+            head.setHeader("If-Modified-Since", lastModified);
+        }
+        if (eTags != null) {
+            head.setHeader("If-None-Match", eTags);
+        }
+
+        CloseableHttpResponse response = null;
+        try {
+            response = executeHttpRequest(head);
+            assert response != null;
+            StatusLine statusLine = new StatusLine(response);
+            if (statusLine.getStatusCode() == 200) { // 返回200 才做操作
+                if (!response.getLastHeader(LAST_MODIFIED_HEADER).getValue()
+                        .equalsIgnoreCase(lastModified)
+                        || !response.getLastHeader(ETAG_HEADER).getValue()
+                        .equalsIgnoreCase(eTags)) {
+
+                    lastModified = response.getLastHeader(LAST_MODIFIED_HEADER) == null ? null
+                            : response.getLastHeader(LAST_MODIFIED_HEADER)
+                            .getValue();
+                    eTags = response.getLastHeader(ETAG_HEADER) == null ? null
+                            : response.getLastHeader(ETAG_HEADER).getValue();
+                    return true;
+                }
+            } else if (statusLine.getStatusCode() == 304) {
+                return false;
+            } else {
+                logger.info("remote synonym {} return bad code {}", location,
+                        statusLine.getStatusCode());
+            }
+        } catch (Exception e){
+            return false;
+        } finally {
+            try {
+                if (response != null) {
+                    response.close();
+                }
+            } catch (IOException e) {
+                logger.error("failed to close http response", e);
+            }
+        }
+        return false;
+    }
+}

+ 21 - 0
src/main/java/com/bellszhu/elasticsearch/plugin/synonym/analysis/SynonymFile.java

@@ -0,0 +1,21 @@
+/**
+ *
+ */
+package com.bellszhu.elasticsearch.plugin.synonym.analysis;
+
+import java.io.Reader;
+
+import org.apache.lucene.analysis.synonym.SynonymMap;
+
+/**
+ * @author bellszhu
+ */
+public interface SynonymFile {
+
+    SynonymMap reloadSynonymMap();
+
+    boolean isNeedReloadSynonymMap();
+
+    Reader getReader();
+
+}

+ 80 - 0
src/main/resources/plugin-descriptor.properties

@@ -0,0 +1,80 @@
+# Elasticsearch plugin descriptor file
+# This file must exist as 'plugin-descriptor.properties' at
+# the root directory of all plugins.
+#
+# A plugin can be 'site', 'jvm', or both.
+#
+### example site plugin for "foo":
+#
+# foo.zip <-- zip file for the plugin, with this structure:
+#   _site/ <-- the contents that will be served
+#   plugin-descriptor.properties <-- example contents below:
+#
+# site=true
+# description=My cool plugin
+# version=1.0
+#
+### example jvm plugin for "foo"
+#
+# foo.zip <-- zip file for the plugin, with this structure:
+#   <arbitrary name1>.jar <-- classes, resources, dependencies
+#   <arbitrary nameN>.jar <-- any number of jars
+#   plugin-descriptor.properties <-- example contents below:
+#
+# jvm=true
+# classname=foo.bar.BazPlugin
+# description=My cool plugin
+# version=2.0.0-rc1
+# elasticsearch.version=2.0
+# java.version=1.7
+#
+### mandatory elements for all plugins:
+#
+# 'description': simple summary of the plugin
+description=${project.description}
+#
+# 'version': plugin's version
+version=${project.version}
+#
+# 'name': the plugin name
+name=${elasticsearch.plugin.name}
+
+### mandatory elements for site plugins:
+#
+# 'site': set to true to indicate contents of the _site/
+#  directory in the root of the plugin should be served.
+# site=${elasticsearch.plugin.site}
+#
+### mandatory elements for jvm plugins :
+#
+# 'jvm': true if the 'classname' class should be loaded
+#  from jar files in the root directory of the plugin.
+#  Note that only jar files in the root directory are
+#  added to the classpath for the plugin! If you need
+#  other resources, package them into a resources jar.
+# jvm=${elasticsearch.plugin.jvm}
+#
+# 'classname': the name of the class to load, fully-qualified.
+classname=${elasticsearch.plugin.classname}
+#
+# 'java.version' version of java the code is built against
+# use the system property java.specification.version
+# version string must be a sequence of nonnegative decimal integers
+# separated by "."'s and may have leading zeros
+java.version=${maven.compiler.target}
+#
+# 'elasticsearch.version' version of elasticsearch compiled against
+# You will have to release a new version of the plugin for each new
+# elasticsearch release. This version is checked when the plugin
+# is loaded so Elasticsearch will refuse to start in the presence of
+# plugins with the incorrect elasticsearch.version.
+elasticsearch.version=${elasticsearch.version}
+#
+### deprecated elements for jvm plugins :
+#
+# 'isolated': true if the plugin should have its own classloader.
+# passing false is deprecated, and only intended to support plugins 
+# that have hard dependencies against each other. If this is
+# not specified, then the plugin is isolated by default.
+# isolated=${elasticsearch.plugin.isolated}
+#

+ 3 - 0
src/main/resources/plugin-security.policy

@@ -0,0 +1,3 @@
+grant {
+  permission java.net.SocketPermission "*", "connect,resolve";
+};

文件差異過大導致無法顯示
+ 59702 - 0
ti_synonym.txt