Bläddra i källkod

拼音分词插件源码

liwh 1 månad sedan
incheckning
14577ca3a3
25 ändrade filer med 4156 tillägg och 0 borttagningar
  1. 2 0
      .gitignore
  2. 202 0
      LICENSE.txt
  3. 378 0
      README.md
  4. 8 0
      elasticsearch-analysis-pinyin.iml
  5. BIN
      lib/nlp-lang-1.7.jar
  6. 295 0
      pom.xml
  7. 39 0
      src/main/assemblies/plugin.xml
  8. 49 0
      src/main/java/org/elasticsearch/analysis/PinyinConfig.java
  9. 38 0
      src/main/java/org/elasticsearch/index/analysis/ChineseUtil.java
  10. 15 0
      src/main/java/org/elasticsearch/index/analysis/ConfigErrorException.java
  11. 29 0
      src/main/java/org/elasticsearch/index/analysis/PinyinAbbreviationsTokenizerFactory.java
  12. 202 0
      src/main/java/org/elasticsearch/index/analysis/PinyinAlphabetTokenizer.java
  13. 26 0
      src/main/java/org/elasticsearch/index/analysis/PinyinAnalyzer.java
  14. 27 0
      src/main/java/org/elasticsearch/index/analysis/PinyinAnalyzerProvider.java
  15. 336 0
      src/main/java/org/elasticsearch/index/analysis/PinyinTokenFilter.java
  16. 23 0
      src/main/java/org/elasticsearch/index/analysis/PinyinTokenFilterFactory.java
  17. 336 0
      src/main/java/org/elasticsearch/index/analysis/PinyinTokenizer.java
  18. 23 0
      src/main/java/org/elasticsearch/index/analysis/PinyinTokenizerFactory.java
  19. 31 0
      src/main/java/org/elasticsearch/index/analysis/TermItem.java
  20. 36 0
      src/main/java/org/elasticsearch/plugin/analysis/pinyin/AnalysisPinyinPlugin.java
  21. 440 0
      src/main/resources/pinyin_alphabet.dict
  22. 57 0
      src/main/resources/plugin-descriptor.properties
  23. 30 0
      src/test/java/org/elasticsearch/index/analysis/PinyinAlphabetTokenizerTest.java
  24. 1529 0
      src/test/java/org/elasticsearch/index/analysis/PinyinAnalysisTest.java
  25. 5 0
      src/test/resources/log4j.properties

+ 2 - 0
.gitignore

@@ -0,0 +1,2 @@
+.idea/
+target/

+ 202 - 0
LICENSE.txt

@@ -0,0 +1,202 @@
+
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright [yyyy] [name of copyright owner]
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.

+ 378 - 0
README.md

@@ -0,0 +1,378 @@
+Pinyin Analysis for Elasticsearch
+==================================
+
+This Pinyin Analysis plugin converts between Chinese characters and Pinyin, and integrates the NLP tools library (https://github.com/NLPchina/nlp-lang).
+
+    --------------------------------------------------
+    | Pinyin Analysis Plugin        | Elasticsearch  |
+    --------------------------------------------------
+    | master                        | 7.x -> master  |
+    --------------------------------------------------
+    | 6.x                           | 6.x            |
+    --------------------------------------------------  
+    | 5.x                           | 5.x            |
+    --------------------------------------------------  
+    | 1.8.1                         | 2.4.1          |
+    --------------------------------------------------  
+    | 1.7.5                         | 2.3.5          |
+    --------------------------------------------------  
+    | 1.6.1                         | 2.2.1          |
+    --------------------------------------------------
+    | 1.5.0                         | 2.1.0          |
+    --------------------------------------------------
+    | 1.4.0                         | 2.0.x          |
+    --------------------------------------------------
+    | 1.3.0                         | 1.6.x          |
+    --------------------------------------------------
+    | 1.2.2                         | 1.0.x          |
+    --------------------------------------------------
+
+The plugin includes analyzer: `pinyin` ,  tokenizer: `pinyin` and  token-filter:  `pinyin`.
+
+**Optional Parameters**
+* `keep_first_letter` when this option enabled,  eg: `刘德华`>`ldh`, default: true
+* `keep_separate_first_letter` when this option is enabled, the first letters are kept as separate terms, eg: `刘德华`>`l`,`d`,`h`, default: false, NOTE: query results may be too fuzzy because such terms occur very frequently
+* `limit_first_letter_length` set max length of the first_letter result, default: 16
+* `keep_full_pinyin` when this option enabled, eg: `刘德华`> [`liu`,`de`,`hua`], default: true
+* `keep_joined_full_pinyin` when this option enabled, eg: `刘德华`> [`liudehua`], default: false
+* `keep_none_chinese` keep non chinese letter or number in result, default: true
+* `keep_none_chinese_together` keep non-Chinese letters together, default: true, eg: `DJ音乐家` -> `DJ`,`yin`,`yue`,`jia`; when set to `false`, eg: `DJ音乐家` -> `D`,`J`,`yin`,`yue`,`jia`, NOTE: `keep_none_chinese` should be enabled first
+* `keep_none_chinese_in_first_letter` keep non Chinese letters in first letter, eg: `刘德华AT2016`->`ldhat2016`, default: true
+* `keep_none_chinese_in_joined_full_pinyin` keep non Chinese letters in joined full pinyin, eg: `刘德华2016`->`liudehua2016`, default: false
+* `none_chinese_pinyin_tokenize` break non-Chinese letters into separate pinyin terms if they are pinyin, default: true, eg: `liudehuaalibaba13zhuanghan` -> `liu`,`de`,`hua`,`a`,`li`,`ba`,`ba`,`13`,`zhuang`,`han`, NOTE: `keep_none_chinese` and `keep_none_chinese_together` should be enabled first
+* `keep_original` when this option enabled, will keep original input as well, default: false
+* `lowercase`  lowercase non Chinese letters, default: true
+* `trim_whitespace` default: true
+* `remove_duplicated_term` when this option is enabled, duplicated terms are removed to save index space, eg: `de的`>`de`, default: false, NOTE: position-related queries may be affected
+* `ignore_pinyin_offset` since 6.0, offsets are strictly constrained and overlapping tokens are not allowed; with this parameter enabled, overlapping tokens are allowed by ignoring their offsets. Please note that all position-related queries and highlighting will then become incorrect, so you should use multi-fields and specify different settings for different query purposes. If you need offsets, set it to false. default: true.
+
+
+
+1.Create an index with a custom pinyin analyzer
+<pre>
+PUT /medcl/ 
+{
+    "settings" : {
+        "analysis" : {
+            "analyzer" : {
+                "pinyin_analyzer" : {
+                    "tokenizer" : "my_pinyin"
+                    }
+            },
+            "tokenizer" : {
+                "my_pinyin" : {
+                    "type" : "pinyin",
+                    "keep_separate_first_letter" : false,
+                    "keep_full_pinyin" : true,
+                    "keep_original" : true,
+                    "limit_first_letter_length" : 16,
+                    "lowercase" : true,
+                    "remove_duplicated_term" : true
+                }
+            }
+        }
+    }
+}
+</pre>
+
+2.Test the analyzer by analyzing a Chinese name, such as 刘德华
+<pre>
+GET /medcl/_analyze
+{
+  "text": ["刘德华"],
+  "analyzer": "pinyin_analyzer"
+}</pre>
+<pre>
+{
+  "tokens" : [
+    {
+      "token" : "liu",
+      "start_offset" : 0,
+      "end_offset" : 1,
+      "type" : "word",
+      "position" : 0
+    },
+    {
+      "token" : "de",
+      "start_offset" : 1,
+      "end_offset" : 2,
+      "type" : "word",
+      "position" : 1
+    },
+    {
+      "token" : "hua",
+      "start_offset" : 2,
+      "end_offset" : 3,
+      "type" : "word",
+      "position" : 2
+    },
+    {
+      "token" : "刘德华",
+      "start_offset" : 0,
+      "end_offset" : 3,
+      "type" : "word",
+      "position" : 3
+    },
+    {
+      "token" : "ldh",
+      "start_offset" : 0,
+      "end_offset" : 3,
+      "type" : "word",
+      "position" : 4
+    }
+  ]
+}
+</pre>
+
+3.Create mapping
+<pre>
+POST /medcl/_mapping 
+{
+        "properties": {
+            "name": {
+                "type": "keyword",
+                "fields": {
+                    "pinyin": {
+                        "type": "text",
+                        "store": false,
+                        "term_vector": "with_offsets",
+                        "analyzer": "pinyin_analyzer",
+                        "boost": 10
+                    }
+                }
+            }
+        }
+    
+}
+</pre>
+
+4.Indexing
+<pre>
+POST /medcl/_create/andy
+{"name":"刘德华"}
+</pre>
+
+5.Let's search
+
+<pre>
+
+curl http://localhost:9200/medcl/_search?q=name:%E5%88%98%E5%BE%B7%E5%8D%8E
+curl http://localhost:9200/medcl/_search?q=name.pinyin:%e5%88%98%e5%be%b7
+curl http://localhost:9200/medcl/_search?q=name.pinyin:liu
+curl http://localhost:9200/medcl/_search?q=name.pinyin:ldh
+curl http://localhost:9200/medcl/_search?q=name.pinyin:de+hua
+
+</pre>
+
+6.Using Pinyin-TokenFilter
+<pre>
+PUT /medcl1/ 
+{
+    "settings" : {
+        "analysis" : {
+            "analyzer" : {
+                "user_name_analyzer" : {
+                    "tokenizer" : "whitespace",
+                    "filter" : "pinyin_first_letter_and_full_pinyin_filter"
+                }
+            },
+            "filter" : {
+                "pinyin_first_letter_and_full_pinyin_filter" : {
+                    "type" : "pinyin",
+                    "keep_first_letter" : true,
+                    "keep_full_pinyin" : false,
+                    "keep_none_chinese" : true,
+                    "keep_original" : false,
+                    "limit_first_letter_length" : 16,
+                    "lowercase" : true,
+                    "trim_whitespace" : true,
+                    "keep_none_chinese_in_first_letter" : true
+                }
+            }
+        }
+    }
+}
+</pre>
+
+Token Test:刘德华 张学友 郭富城 黎明 四大天王
+<pre>
+GET /medcl1/_analyze
+{
+  "text": ["刘德华 张学友 郭富城 黎明 四大天王"],
+  "analyzer": "user_name_analyzer"
+}
+</pre>
+<pre>
+{
+  "tokens" : [
+    {
+      "token" : "ldh",
+      "start_offset" : 0,
+      "end_offset" : 3,
+      "type" : "word",
+      "position" : 0
+    },
+    {
+      "token" : "zxy",
+      "start_offset" : 4,
+      "end_offset" : 7,
+      "type" : "word",
+      "position" : 1
+    },
+    {
+      "token" : "gfc",
+      "start_offset" : 8,
+      "end_offset" : 11,
+      "type" : "word",
+      "position" : 2
+    },
+    {
+      "token" : "lm",
+      "start_offset" : 12,
+      "end_offset" : 14,
+      "type" : "word",
+      "position" : 3
+    },
+    {
+      "token" : "sdtw",
+      "start_offset" : 15,
+      "end_offset" : 19,
+      "type" : "word",
+      "position" : 4
+    }
+  ]
+}
+</pre>
+
+
+7.Used in phrase query
+
+- option 1
+
+<pre>
+PUT /medcl2/
+{
+    "settings" : {
+        "analysis" : {
+            "analyzer" : {
+                "pinyin_analyzer" : {
+                    "tokenizer" : "my_pinyin"
+                    }
+            },
+            "tokenizer" : {
+                "my_pinyin" : {
+                    "type" : "pinyin",
+                    "keep_first_letter":false,
+                    "keep_separate_first_letter" : false,
+                    "keep_full_pinyin" : true,
+                    "keep_original" : false,
+                    "limit_first_letter_length" : 16,
+                    "lowercase" : true
+                }
+            }
+        }
+    }
+}
+GET /medcl2/_search
+{
+  "query": {"match_phrase": {
+    "name.pinyin": "刘德华"
+  }}
+}
+
+</pre>
+
+- option 2
+
+<pre>
+ 
+PUT /medcl3/
+{
+   "settings" : {
+       "analysis" : {
+           "analyzer" : {
+               "pinyin_analyzer" : {
+                   "tokenizer" : "my_pinyin"
+                   }
+           },
+           "tokenizer" : {
+               "my_pinyin" : {
+                   "type" : "pinyin",
+                   "keep_first_letter":true,
+                   "keep_separate_first_letter" : true,
+                   "keep_full_pinyin" : true,
+                   "keep_original" : false,
+                   "limit_first_letter_length" : 16,
+                   "lowercase" : true
+               }
+           }
+       }
+   }
+}
+   
+POST /medcl3/_mapping 
+{
+  "properties": {
+      "name": {
+          "type": "keyword",
+          "fields": {
+              "pinyin": {
+                  "type": "text",
+                  "store": false,
+                  "term_vector": "with_offsets",
+                  "analyzer": "pinyin_analyzer",
+                  "boost": 10
+              }
+          }
+      }
+  }
+}
+  
+   
+GET /medcl3/_analyze
+{
+   "text": ["刘德华"],
+   "analyzer": "pinyin_analyzer"
+}
+ 
+POST /medcl3/_create/andy
+{"name":"刘德华"}
+
+GET /medcl3/_search
+{
+ "query": {"match_phrase": {
+   "name.pinyin": "刘德h"
+ }}
+}
+
+GET /medcl3/_search
+{
+ "query": {"match_phrase": {
+   "name.pinyin": "刘dh"
+ }}
+}
+
+GET /medcl3/_search
+{
+ "query": {"match_phrase": {
+   "name.pinyin": "liudh"
+ }}
+}
+
+GET /medcl3/_search
+{
+ "query": {"match_phrase": {
+   "name.pinyin": "liudeh"
+ }}
+}
+
+GET /medcl3/_search
+{
+ "query": {"match_phrase": {
+   "name.pinyin": "liude华"
+ }}
+}
+
+</pre>
+
+8.That's all, have fun.

+ 8 - 0
elasticsearch-analysis-pinyin.iml

@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<module version="4">
+  <component name="CheckStyle-IDEA-Module">
+    <option name="configuration">
+      <map />
+    </option>
+  </component>
+</module>

BIN
lib/nlp-lang-1.7.jar


+ 295 - 0
pom.xml

@@ -0,0 +1,295 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <name>elasticsearch-analysis-pinyin</name>
+    <modelVersion>4.0.0</modelVersion>
+    <groupId>org.elasticsearch</groupId>
+    <artifactId>elasticsearch-analysis-pinyin</artifactId>
+    <version>${elasticsearch.version}</version>
+    <packaging>jar</packaging>
+    <description>Pinyin Analysis for Elasticsearch</description>
+    <inceptionYear>2012</inceptionYear>
+
+    <properties>
+        <elasticsearch.version>7.14.0</elasticsearch.version>
+        <maven.compiler.target>16</maven.compiler.target>
+        <elasticsearch.assembly.descriptor>${project.basedir}/src/main/assemblies/plugin.xml</elasticsearch.assembly.descriptor>
+        <elasticsearch.plugin.name>analysis-pinyin</elasticsearch.plugin.name>
+        <elasticsearch.plugin.classname>org.elasticsearch.plugin.analysis.pinyin.AnalysisPinyinPlugin</elasticsearch.plugin.classname>
+        <elasticsearch.plugin.jvm>true</elasticsearch.plugin.jvm>
+        <tests.rest.load_packaged>false</tests.rest.load_packaged>
+        <skip.unit.tests>true</skip.unit.tests>
+        <gpg.keyname>4E899B30</gpg.keyname>
+        <gpg.useagent>true</gpg.useagent>
+    </properties>
+
+    <licenses>
+        <license>
+            <name>The Apache Software License, Version 2.0</name>
+            <url>http://www.apache.org/licenses/LICENSE-2.0.txt</url>
+            <distribution>repo</distribution>
+        </license>
+    </licenses>
+
+    <developers>
+        <developer>
+            <name>Medcl</name>
+            <email>medcl@elastic.co</email>
+            <organization>elastic</organization>
+            <organizationUrl>http://www.elastic.co</organizationUrl>
+        </developer>
+    </developers>
+
+    <scm>
+        <connection>scm:git:git@github.com:medcl/elasticsearch-analysis-pinyin.git</connection>
+        <developerConnection>scm:git:git@github.com:medcl/elasticsearch-analysis-pinyin.git
+        </developerConnection>
+        <url>http://github.com/medcl/elasticsearch-analysis-pinyin</url>
+    </scm>
+
+    <parent>
+        <groupId>org.sonatype.oss</groupId>
+        <artifactId>oss-parent</artifactId>
+        <version>9</version>
+    </parent>
+
+    <distributionManagement>
+        <snapshotRepository>
+            <id>oss.sonatype.org</id>
+            <url>https://oss.sonatype.org/content/repositories/snapshots</url>
+        </snapshotRepository>
+        <repository>
+            <id>oss.sonatype.org</id>
+            <url>https://oss.sonatype.org/service/local/staging/deploy/maven2/</url>
+        </repository>
+    </distributionManagement>
+
+    <repositories>
+        <repository>
+            <id>oss.sonatype.org</id>
+            <name>OSS Sonatype</name>
+            <releases>
+                <enabled>true</enabled>
+            </releases>
+            <snapshots>
+                <enabled>true</enabled>
+            </snapshots>
+            <url>http://oss.sonatype.org/content/repositories/releases/</url>
+        </repository>
+    </repositories>
+
+    <dependencies>
+        <dependency>
+            <groupId>org.nlpcn</groupId>
+            <artifactId>nlp-lang</artifactId>
+            <version>1.7</version>
+            <systemPath>${basedir}/lib/nlp-lang-1.7.jar</systemPath>
+            <scope>system</scope>
+        </dependency>
+
+        <dependency>
+            <groupId>org.elasticsearch</groupId>
+            <artifactId>elasticsearch</artifactId>
+            <version>${elasticsearch.version}</version>
+            <scope>compile</scope>
+        </dependency>
+
+        <dependency>
+            <groupId>log4j</groupId>
+            <artifactId>log4j</artifactId>
+            <version>1.2.17</version>
+            <scope>runtime</scope>
+        </dependency>
+
+        <dependency>
+            <groupId>org.hamcrest</groupId>
+            <artifactId>hamcrest-core</artifactId>
+            <version>1.3.RC2</version>
+            <scope>test</scope>
+        </dependency>
+
+        <dependency>
+            <groupId>org.hamcrest</groupId>
+            <artifactId>hamcrest-library</artifactId>
+            <version>1.3.RC2</version>
+            <scope>test</scope>
+        </dependency>
+
+        <dependency>
+            <groupId>org.powermock</groupId>
+            <artifactId>powermock-module-junit4</artifactId>
+            <version>1.6.2</version>
+            <scope>test</scope>
+        </dependency>
+
+        <dependency>
+            <groupId>org.powermock</groupId>
+            <artifactId>powermock-api-mockito</artifactId>
+            <version>1.6.2</version>
+            <scope>test</scope>
+        </dependency>
+
+        <dependency>
+            <groupId>nl.jqno.equalsverifier</groupId>
+            <artifactId>equalsverifier</artifactId>
+            <version>1.7.5</version>
+            <scope>test</scope>
+        </dependency>
+
+        <dependency>
+            <groupId>com.openpojo</groupId>
+            <artifactId>openpojo</artifactId>
+            <version>0.8.1</version>
+            <scope>test</scope>
+        </dependency>
+
+        <dependency>
+            <groupId>junit</groupId>
+            <artifactId>junit</artifactId>
+            <version>4.9</version>
+            <scope>test</scope>
+        </dependency>
+    </dependencies>
+
+    <build>
+        <plugins>
+            <plugin>
+                <groupId>org.apache.maven.plugins</groupId>
+                <artifactId>maven-compiler-plugin</artifactId>
+                <version>3.5.1</version>
+                <configuration>
+                    <source>${maven.compiler.target}</source>
+                    <target>${maven.compiler.target}</target>
+                </configuration>
+            </plugin>
+            <plugin>
+                <groupId>org.apache.maven.plugins</groupId>
+                <artifactId>maven-surefire-plugin</artifactId>
+                <version>2.19.1</version>
+            </plugin>
+            <plugin>
+                <groupId>org.apache.maven.plugins</groupId>
+                <artifactId>maven-source-plugin</artifactId>
+                <version>2.1.2</version>
+                <executions>
+                    <execution>
+                        <id>attach-sources</id>
+                        <goals>
+                            <goal>jar</goal>
+                        </goals>
+                    </execution>
+                </executions>
+            </plugin>
+            <plugin>
+                <artifactId>maven-assembly-plugin</artifactId>
+                <version>2.3</version>
+                <configuration>
+                    <appendAssemblyId>false</appendAssemblyId>
+                    <outputDirectory>${project.build.directory}/releases/</outputDirectory>
+                    <descriptors>
+                        <descriptor>${basedir}/src/main/assemblies/plugin.xml</descriptor>
+                    </descriptors>
+                </configuration>
+                <executions>
+                    <execution>
+                        <phase>package</phase>
+                        <goals>
+                            <goal>single</goal>
+                        </goals>
+                    </execution>
+                </executions>
+            </plugin>
+        </plugins>
+    </build>
+
+    <profiles>
+        <profile>
+            <id>disable-java8-doclint</id>
+            <activation>
+                <jdk>[1.8,)</jdk>
+            </activation>
+            <properties>
+                <additionalparam>-Xdoclint:none</additionalparam>
+            </properties>
+        </profile>
+        <profile>
+            <id>release</id>
+            <build>
+                <plugins>
+                    <plugin>
+                        <groupId>org.sonatype.plugins</groupId>
+                        <artifactId>nexus-staging-maven-plugin</artifactId>
+                        <version>1.6.3</version>
+                        <extensions>true</extensions>
+                        <configuration>
+                            <serverId>oss</serverId>
+                            <nexusUrl>https://oss.sonatype.org/</nexusUrl>
+                            <autoReleaseAfterClose>true</autoReleaseAfterClose>
+                        </configuration>
+                    </plugin>
+                    <plugin>
+                        <groupId>org.apache.maven.plugins</groupId>
+                        <artifactId>maven-release-plugin</artifactId>
+                        <version>2.1</version>
+                        <configuration>
+                            <autoVersionSubmodules>true</autoVersionSubmodules>
+                            <useReleaseProfile>false</useReleaseProfile>
+                            <releaseProfiles>release</releaseProfiles>
+                            <goals>deploy</goals>
+                        </configuration>
+                    </plugin>
+                    <plugin>
+                        <groupId>org.apache.maven.plugins</groupId>
+                        <artifactId>maven-compiler-plugin</artifactId>
+                        <version>3.5.1</version>
+                        <configuration>
+                            <source>${maven.compiler.target}</source>
+                            <target>${maven.compiler.target}</target>
+                        </configuration>
+                    </plugin>
+                    <plugin>
+                        <groupId>org.apache.maven.plugins</groupId>
+                        <artifactId>maven-gpg-plugin</artifactId>
+                        <version>1.5</version>
+                        <executions>
+                            <execution>
+                                <id>sign-artifacts</id>
+                                <phase>verify</phase>
+                                <goals>
+                                    <goal>sign</goal>
+                                </goals>
+                            </execution>
+                        </executions>
+                    </plugin>
+                    <plugin>
+                        <groupId>org.apache.maven.plugins</groupId>
+                        <artifactId>maven-source-plugin</artifactId>
+                        <version>2.2.1</version>
+                        <executions>
+                            <execution>
+                                <id>attach-sources</id>
+                                <goals>
+                                    <goal>jar-no-fork</goal>
+                                </goals>
+                            </execution>
+                        </executions>
+                    </plugin>
+                    <plugin>
+                        <groupId>org.apache.maven.plugins</groupId>
+                        <artifactId>maven-javadoc-plugin</artifactId>
+                        <version>2.9</version>
+                        <executions>
+                            <execution>
+                                <id>attach-javadocs</id>
+                                <goals>
+                                    <goal>jar</goal>
+                                </goals>
+                            </execution>
+                        </executions>
+                    </plugin>
+                </plugins>
+            </build>
+        </profile>
+    </profiles>
+</project>

+ 39 - 0
src/main/assemblies/plugin.xml

@@ -0,0 +1,39 @@
+<?xml version="1.0"?>
+<!-- Maven assembly descriptor: packages the plugin and its runtime jars into one zip archive. -->
+<assembly>
+    <id>plugin</id>
+    <formats>
+        <format>zip</format>
+    </formats>
+    <!-- Archive entries are written at the zip root, without a wrapping base directory. -->
+    <includeBaseDirectory>false</includeBaseDirectory>
+    <files>
+        <!-- The plugin descriptor is copied with filtering enabled, so property placeholders in it are resolved. -->
+        <file>
+            <source>${project.basedir}/src/main/resources/plugin-descriptor.properties</source>
+            <outputDirectory/>
+            <filtered>true</filtered>
+        </file>
+    </files>
+    <dependencySets>
+        <!-- Project artifact plus dependencies, leaving out the Elasticsearch artifact itself. -->
+        <dependencySet>
+            <outputDirectory>/</outputDirectory>
+            <useProjectArtifact>true</useProjectArtifact>
+            <useTransitiveFiltering>true</useTransitiveFiltering>
+            <excludes>
+                <exclude>org.elasticsearch:elasticsearch</exclude>
+            </excludes>
+        </dependencySet>
+        <!-- NOTE(review): this set only includes org.apache.lucene:lucene-pinyin; confirm that artifact is actually a dependency of the project. -->
+        <dependencySet>
+            <outputDirectory>/</outputDirectory>
+            <useProjectArtifact>true</useProjectArtifact>
+            <useTransitiveFiltering>true</useTransitiveFiltering>
+            <includes>
+                <include>org.apache.lucene:lucene-pinyin</include>
+            </includes>
+        </dependencySet>
+    </dependencySets>
+   <fileSets>
+        <!-- Everything under the project's lib/ directory is copied to the archive root. -->
+        <fileSet>
+            <directory>${basedir}/lib/</directory>
+            <outputDirectory>/</outputDirectory>
+        </fileSet>
+    </fileSets>
+</assembly>

+ 49 - 0
src/main/java/org/elasticsearch/analysis/PinyinConfig.java

@@ -0,0 +1,49 @@
+package org.elasticsearch.analysis;
+
+import org.elasticsearch.common.settings.Settings;
+
+/**
+ * Mutable option holder shared by the pinyin analyzer, tokenizer and token filter.
+ * <p>
+ * Every option is a public field whose initializer is its default value. The
+ * {@link #PinyinConfig(Settings)} constructor overrides each field from the setting
+ * of the corresponding snake_case name and falls back to that same default when the
+ * setting is absent.
+ * <p>
+ * Created by medcl on 15/11/26.
+ */
+public class PinyinConfig {
+
+    /** Lower-case every emitted term. */
+    public boolean lowercase = true;
+    /** Trim leading/trailing whitespace from the source text and from emitted terms. */
+    public boolean trimWhitespace = true;
+    /** Keep ASCII letters and digits found in the input as terms. */
+    public boolean keepNoneChinese = true;
+    /** Append ASCII letters and digits to the joined first-letter term. */
+    public boolean keepNoneChineseInFirstLetter = true;
+    /** Append ASCII letters and digits to the joined full-pinyin term. */
+    public boolean keepNoneChineseInJoinedFullPinyin = false;
+    /** Also emit the original (untransformed) text as a term. */
+    public boolean keepOriginal = false;
+    /** Emit one term made of the first pinyin letter of every character. */
+    public boolean keepFirstLetter = true;
+    /** Emit the first pinyin letter of each character as its own term. */
+    public boolean keepSeparateFirstLetter = false;
+    /** Buffer consecutive ASCII letters/digits and emit them together instead of one by one. */
+    public boolean keepNoneChineseTogether = true;
+    /** Split buffered ASCII text into pinyin syllables with {@code PinyinAlphabetTokenizer}. */
+    public boolean noneChinesePinyinTokenize = true;
+    /** Maximum length of the joined first-letter term; values of 0 or less disable truncation. */
+    public int     LimitFirstLetterLength = 16;
+    /** Emit the full pinyin of each character as its own term. */
+    public boolean keepFullPinyin = true;
+    /** Emit one term made of the full pinyin of all characters joined together. */
+    public boolean keepJoinedFullPinyin = false;
+    /** Drop a term that was already emitted, regardless of its position. */
+    public boolean removeDuplicateTerm = false;
+    /** Give every syllable split out of ASCII text an offset width of exactly one. */
+    public boolean fixedPinyinOffset = false;
+    /**
+     * Since Elasticsearch 6.0 offsets are strictly constrained and overlapping tokens are
+     * rejected. When this flag is {@code true} offsets are ignored so overlapping tokens are
+     * allowed; note that position-related queries and highlighting then become incorrect.
+     * Use multi-fields with different settings for different query purposes, and set this to
+     * {@code false} if offsets are needed. Default: {@code true}.
+     */
+    public boolean ignorePinyinOffset = true;
+    /** Emit each Chinese character itself as a term. */
+    public  boolean keepSeparateChinese = false;
+
+    /** Creates a configuration in which every option has its default value. */
+    public PinyinConfig() {}
+
+    /**
+     * Creates a configuration from index settings.
+     *
+     * @param settings component settings; a missing key leaves the field at its default
+     */
+    public PinyinConfig(Settings settings) {
+        // Field initializers have already run here, so each field still holds its default
+        // and serves as the fallback value for its own setting.
+        this.lowercase = settings.getAsBoolean("lowercase", this.lowercase);
+        this.trimWhitespace = settings.getAsBoolean("trim_whitespace", this.trimWhitespace);
+        this.keepOriginal = settings.getAsBoolean("keep_original", this.keepOriginal);
+        this.removeDuplicateTerm = settings.getAsBoolean("remove_duplicated_term", this.removeDuplicateTerm);
+
+        this.keepFirstLetter = settings.getAsBoolean("keep_first_letter", this.keepFirstLetter);
+        this.keepSeparateFirstLetter = settings.getAsBoolean("keep_separate_first_letter", this.keepSeparateFirstLetter);
+        this.LimitFirstLetterLength = settings.getAsInt("limit_first_letter_length", this.LimitFirstLetterLength);
+        this.keepFullPinyin = settings.getAsBoolean("keep_full_pinyin", this.keepFullPinyin);
+        this.keepJoinedFullPinyin = settings.getAsBoolean("keep_joined_full_pinyin", this.keepJoinedFullPinyin);
+        this.keepSeparateChinese = settings.getAsBoolean("keep_separate_chinese", this.keepSeparateChinese);
+
+        this.keepNoneChinese = settings.getAsBoolean("keep_none_chinese", this.keepNoneChinese);
+        this.keepNoneChineseTogether = settings.getAsBoolean("keep_none_chinese_together", this.keepNoneChineseTogether);
+        this.noneChinesePinyinTokenize = settings.getAsBoolean("none_chinese_pinyin_tokenize", this.noneChinesePinyinTokenize);
+        this.keepNoneChineseInFirstLetter = settings.getAsBoolean("keep_none_chinese_in_first_letter", this.keepNoneChineseInFirstLetter);
+        this.keepNoneChineseInJoinedFullPinyin = settings.getAsBoolean("keep_none_chinese_in_joined_full_pinyin", this.keepNoneChineseInJoinedFullPinyin);
+
+        this.fixedPinyinOffset = settings.getAsBoolean("fixed_pinyin_offset", this.fixedPinyinOffset);
+        this.ignorePinyinOffset = settings.getAsBoolean("ignore_pinyin_offset", this.ignorePinyinOffset);
+    }
+}

+ 38 - 0
src/main/java/org/elasticsearch/index/analysis/ChineseUtil.java

@@ -0,0 +1,38 @@
+package org.elasticsearch.index.analysis;
+
+import org.nlpcn.commons.lang.util.StringUtil;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.LinkedList;
+import java.util.List;
+
+/**
+ * Helper for separating Chinese characters from everything else in a string.
+ */
+public class ChineseUtil {
+    /**
+     * First character (U+4E00) of the range treated as Chinese.
+     */
+    public static final char CJK_UNIFIED_IDEOGRAPHS_START = '\u4E00';
+    /**
+     * Last character (U+9FA5) of the range treated as Chinese.
+     */
+    public static final char CJK_UNIFIED_IDEOGRAPHS_END = '\u9FA5';
+
+    /**
+     * Maps every char of {@code str} to a list slot, so the result can be indexed with the
+     * same index as the input string.
+     *
+     * @param str text to inspect
+     * @return an empty list when {@code StringUtil.isBlank(str)} is true; otherwise a list of
+     *         exactly {@code str.length()} entries where entry {@code i} is the one-character
+     *         string at index {@code i} if that char lies in
+     *         [{@link #CJK_UNIFIED_IDEOGRAPHS_START}, {@link #CJK_UNIFIED_IDEOGRAPHS_END}],
+     *         and {@code null} otherwise
+     */
+    public static List<String> segmentChinese(String str){
+        if (StringUtil.isBlank(str)) {
+            return Collections.emptyList();
+        }
+
+        // Always an ArrayList: callers look entries up by index, which would be a linear
+        // scan per lookup on a linked list (the previous choice for very long inputs).
+        int length = str.length();
+        List<String> lists = new ArrayList<>(length);
+        for (int i = 0; i < length; i++) {
+            char c = str.charAt(i);
+            boolean chinese = c >= CJK_UNIFIED_IDEOGRAPHS_START && c <= CJK_UNIFIED_IDEOGRAPHS_END;
+            // null keeps the slot so list indexes stay aligned with string indexes
+            lists.add(chinese ? String.valueOf(c) : null);
+        }
+        return lists;
+    }
+}

+ 15 - 0
src/main/java/org/elasticsearch/index/analysis/ConfigErrorException.java

@@ -0,0 +1,15 @@
+package org.elasticsearch.index.analysis;
+
+/**
+ * Unchecked exception raised when the pinyin plugin is configured with an invalid
+ * combination of options.
+ * <p>
+ * Created by medcl on 16/8/22.
+ */
+public class ConfigErrorException extends RuntimeException {
+
+    /**
+     * @param message description of the configuration problem; returned by {@link #getMessage()}
+     */
+    public ConfigErrorException(String message) {
+        // Hand the message to RuntimeException instead of keeping a private copy, so the
+        // standard Throwable accessors all report it without an override.
+        super(message);
+    }
+}

+ 29 - 0
src/main/java/org/elasticsearch/index/analysis/PinyinAbbreviationsTokenizerFactory.java

@@ -0,0 +1,29 @@
+package org.elasticsearch.index.analysis;
+
+import org.apache.lucene.analysis.Tokenizer;
+import org.elasticsearch.analysis.PinyinConfig;
+import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.env.Environment;
+import org.elasticsearch.index.IndexSettings;
+
+/**
+ * Tokenizer factory with a fixed configuration: it ignores the component settings and
+ * always builds a {@link PinyinTokenizer} that keeps the joined first letters and drops
+ * the full pinyin.
+ */
+public class PinyinAbbreviationsTokenizerFactory extends AbstractTokenizerFactory {
+
+    public PinyinAbbreviationsTokenizerFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
+        super(indexSettings, settings, name);
+    }
+
+    /**
+     * @return a new tokenizer backed by a freshly built abbreviation configuration
+     */
+    @Override
+    public Tokenizer create() {
+        return new PinyinTokenizer(abbreviationConfig());
+    }
+
+    /** Builds the hard-coded option set used by every tokenizer this factory creates. */
+    private static PinyinConfig abbreviationConfig() {
+        PinyinConfig cfg = new PinyinConfig();
+
+        // what is emitted
+        cfg.keepFirstLetter = true;
+        cfg.keepFullPinyin = false;
+        cfg.keepOriginal = false;
+
+        // handling of non-Chinese characters
+        cfg.keepNoneChinese = false;
+        cfg.keepNoneChineseTogether = true;
+        cfg.noneChinesePinyinTokenize = false;
+        cfg.keepNoneChineseInFirstLetter = true;
+
+        // term normalisation
+        cfg.lowercase = true;
+        cfg.trimWhitespace = true;
+        return cfg;
+    }
+}

+ 202 - 0
src/main/java/org/elasticsearch/index/analysis/PinyinAlphabetTokenizer.java

@@ -0,0 +1,202 @@
+package org.elasticsearch.index.analysis;
+
+import java.io.BufferedReader;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.util.*;
+
+/**
+ * Splits Latin-letter text into pinyin syllables using the dictionary held by
+ * {@code PinyinAlphabetDict}.
+ * <p>
+ * The input is lower-cased and cut into alternating runs of ASCII letters and non-letters.
+ * Runs longer than one char are segmented by maximum matching in both directions, and the
+ * segmentation with fewer pieces is kept. Text that matches no dictionary entry is kept as
+ * its own piece, so the pieces always appear in the order of the original text.
+ * <p>
+ * Created by medcl on 16/10/13.
+ */
+public class PinyinAlphabetTokenizer {
+
+    /** Longest dictionary entry tried at each step of maximum matching. */
+    private static final int PINYIN_MAX_LENGTH = 6;
+
+    /**
+     * Segments {@code text} into pinyin syllables and unmatched fragments.
+     *
+     * @param text text to segment; must not be {@code null}
+     * @return the lower-cased pieces in text order
+     */
+    public static List<String> walk(String text) {
+        return segPinyinStr(text);
+    }
+
+    private static List<String> segPinyinStr(String content) {
+        // Locale.ROOT keeps the mapping of ASCII letters stable under every default locale
+        // (e.g. a Turkish default locale would otherwise turn 'I' into a dotless i).
+        String pinyinStr = content.toLowerCase(Locale.ROOT);
+        // split into letter runs and non-letter runs
+        List<String> pinyinStrList = splitByNoletter(pinyinStr);
+        List<String> pinyinList = new ArrayList<>();
+        for (String pinyinText : pinyinStrList) {
+            if (pinyinText.length() == 1) {
+                pinyinList.add(pinyinText);
+                continue;
+            }
+            List<String> forward = positiveMaxMatch(pinyinText, PINYIN_MAX_LENGTH);
+            if (forward.size() == 1) {
+                // a single forward piece cannot be improved on, skip the backward pass
+                pinyinList.addAll(forward);
+                continue;
+            }
+            // run both directions and keep the one with fewer pieces (forward wins ties)
+            List<String> backward = reverseMaxMatch(pinyinText, PINYIN_MAX_LENGTH);
+            pinyinList.addAll(forward.size() <= backward.size() ? forward : backward);
+        }
+        return pinyinList;
+    }
+
+    /** Cuts the text into maximal runs that are entirely ASCII letters or entirely not. */
+    private static List<String> splitByNoletter(String pinyinStr) {
+        List<String> pinyinStrList = new ArrayList<>();
+        StringBuilder sb = new StringBuilder();
+        boolean lastWord = true;
+        for (char c : pinyinStr.toCharArray()) {
+            boolean letter = (c > 96 && c < 123) || (c > 64 && c < 91);
+            // the kind of char changed: close the run collected so far
+            if (letter != lastWord && sb.length() > 0) {
+                pinyinStrList.add(sb.toString());
+                sb.setLength(0);
+            }
+            sb.append(c);
+            lastWord = letter;
+        }
+        if (sb.length() > 0) {
+            pinyinStrList.add(sb.toString());
+        }
+        return pinyinStrList;
+    }
+
+    /** Left-to-right maximum matching; unmatched chars are grouped into their own pieces. */
+    private static List<String> positiveMaxMatch(String pinyinText, int maxLength) {
+        PinyinAlphabetDict dict = PinyinAlphabetDict.getInstance();
+        List<String> pinyinList = new ArrayList<>();
+        StringBuilder noMatchBuffer = new StringBuilder();
+        for (int start = 0; start < pinyinText.length(); ) {
+            int end = Math.min(start + maxLength, pinyinText.length());
+            if (end <= start) {
+                break;
+            }
+            String window = pinyinText.substring(start, end);
+            // try the longest prefix first, then shorter ones
+            String matched = null;
+            for (int len = window.length(); len > 0; len--) {
+                String guess = window.substring(0, len);
+                if (dict.match(guess)) {
+                    matched = guess;
+                    break;
+                }
+            }
+            if (matched == null) {
+                // nothing matched: remember this char and move one position forward
+                noMatchBuffer.append(window.charAt(0));
+                start++;
+            } else {
+                // the unmatched run precedes the match in the text, so it is emitted first
+                flush(noMatchBuffer, pinyinList);
+                pinyinList.add(matched);
+                start += matched.length();
+            }
+        }
+        flush(noMatchBuffer, pinyinList);
+        return pinyinList;
+    }
+
+    /** Right-to-left maximum matching; the result is returned in text order. */
+    private static List<String> reverseMaxMatch(String pinyinText, int maxLength) {
+        PinyinAlphabetDict dict = PinyinAlphabetDict.getInstance();
+        // pieces are collected from the end of the text towards its start
+        List<String> pinyinList = new ArrayList<>();
+        StringBuilder noMatchBuffer = new StringBuilder();
+        for (int end = pinyinText.length(); end > 0; ) {
+            int start = Math.max(end - maxLength, 0);
+            if (start >= end) {
+                break;
+            }
+            String window = pinyinText.substring(start, end);
+            // try the longest suffix first, then shorter ones
+            String matched = null;
+            for (int j = 0; j < window.length(); j++) {
+                String guess = window.substring(j);
+                if (dict.match(guess)) {
+                    matched = guess;
+                    break;
+                }
+            }
+            if (matched == null) {
+                // chars are visited right-to-left, so prepend to keep the run readable left-to-right
+                noMatchBuffer.insert(0, window.charAt(window.length() - 1));
+                end--;
+            } else {
+                // the unmatched run follows the match in the text; in this reversed list it goes first
+                flush(noMatchBuffer, pinyinList);
+                pinyinList.add(matched);
+                end -= matched.length();
+            }
+        }
+        flush(noMatchBuffer, pinyinList);
+        // restore text order
+        Collections.reverse(pinyinList);
+        return pinyinList;
+    }
+
+    /** Appends the pending unmatched run, if any, to {@code out} and clears the buffer. */
+    private static void flush(StringBuilder pending, List<String> out) {
+        if (pending.length() > 0) {
+            out.add(pending.toString());
+            pending.setLength(0);
+        }
+    }
+
+}
+
+ class PinyinAlphabetDict {
+
+    private static final String fileName = "/pinyin_alphabet.dict";
+
+    private Set<String> alphabet = new HashSet<String>();
+
+    private static PinyinAlphabetDict instance;
+
+    private PinyinAlphabetDict() {
+        InputStream in = PinyinAlphabetDict.class.getResourceAsStream(fileName);
+        BufferedReader reader = new BufferedReader(new InputStreamReader(in));
+        try {
+            String line;
+            while (null != (line = reader.readLine())) {
+                if (line.trim().length() > 0) {
+                    alphabet.add(line);
+                }
+            }
+        } catch (Exception ex) {
+            throw new RuntimeException("read pinyin dic error.", ex);
+        } finally {
+            try {
+                reader.close();
+            } catch (Exception ignored) {
+            }
+        }
+    }
+
+    public static PinyinAlphabetDict getInstance() {
+        if (instance == null) {
+            synchronized (PinyinAlphabetDict.class) {
+                if (instance == null) {
+                    instance = new PinyinAlphabetDict();
+                }
+            }
+        }
+        return instance;
+    }
+
+    public boolean match(String c) {
+        return alphabet.contains(c);
+    }
+}

+ 26 - 0
src/main/java/org/elasticsearch/index/analysis/PinyinAnalyzer.java

@@ -0,0 +1,26 @@
+package org.elasticsearch.index.analysis;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.elasticsearch.analysis.PinyinConfig;
+import org.elasticsearch.common.settings.Settings;
+
+/**
+ * Created by IntelliJ IDEA.
+ * User: Medcl'
+ * Date: 12-5-22
+ * Time: 上午10:39
+ */
+public final class PinyinAnalyzer extends Analyzer {
+
+    private PinyinConfig config;
+
+    public PinyinAnalyzer(PinyinConfig config) {
+        this.config=config;
+    }
+
+    @Override
+    protected TokenStreamComponents createComponents(String fieldName) {
+            return new TokenStreamComponents(new PinyinTokenizer(config));
+    }
+
+}

+ 27 - 0
src/main/java/org/elasticsearch/index/analysis/PinyinAnalyzerProvider.java

@@ -0,0 +1,27 @@
+package org.elasticsearch.index.analysis;
+
+import org.elasticsearch.analysis.PinyinConfig;
+import org.elasticsearch.common.inject.Inject;
+import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.env.Environment;
+import org.elasticsearch.index.IndexSettings;
+
+/**
+ * Provider that builds one {@link PinyinAnalyzer} from the component settings when it is
+ * constructed and returns that same instance from every {@link #get()} call.
+ */
+public class PinyinAnalyzerProvider extends AbstractIndexAnalyzerProvider<PinyinAnalyzer> {
+
+    private final PinyinAnalyzer analyzer;
+    private PinyinConfig config;
+
+    /**
+     * @param indexSettings settings of the owning index
+     * @param env           node environment (not used here)
+     * @param name          name of the analyzer
+     * @param settings      analyzer settings parsed into a {@link PinyinConfig}
+     */
+    @Inject
+    public PinyinAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
+        super(indexSettings, name, settings);
+        this.config = new PinyinConfig(settings);
+        this.analyzer = new PinyinAnalyzer(this.config);
+    }
+
+    /**
+     * @return the analyzer created in the constructor
+     */
+    @Override
+    public PinyinAnalyzer get() {
+        return analyzer;
+    }
+}

+ 336 - 0
src/main/java/org/elasticsearch/index/analysis/PinyinTokenFilter.java

@@ -0,0 +1,336 @@
+package org.elasticsearch.index.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ * <p/>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p/>
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.elasticsearch.analysis.PinyinConfig;
+import org.nlpcn.commons.lang.pinyin.Pinyin;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.List;
+
+/**
+ * Token filter that replaces each incoming token with pinyin-derived terms.
+ * <p>
+ * For every input token the filter builds a list of candidate {@link TermItem}s (per-character
+ * pinyin, first letters, joined pinyin, the original text, ... depending on
+ * {@link PinyinConfig}), sorts them, and then returns them one per call of
+ * {@link #incrementToken()}. Only the term text and the position increment are written;
+ * offsets are not modified by this filter.
+ */
+public class PinyinTokenFilter extends TokenFilter {
+
+    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+    /** {@code true} when all candidates of the current input token were emitted. */
+    private boolean done = true;
+    // The processed* flags make each stage of readTerm() run once per input token.
+    private boolean processedCandidate = false;
+    private boolean processedFullPinyinLetter = false;
+    private boolean processedFirstLetter = false;
+    private boolean processedOriginal = false;
+    private boolean processedSortCandidate = false;
+    protected int position = 0;
+    protected int lastOffset = 0;
+    private PinyinConfig config;
+    /** Candidate terms of the current input token. */
+    List<TermItem> candidate;
+    /** Keys of the candidates already accepted, used to drop duplicates. */
+    private HashSet<String> termsFilter;
+
+    /** Index of the next entry of {@link #candidate} to emit. */
+    protected int candidateOffset = 0;
+    StringBuilder firstLetters;
+    StringBuilder fullPinyinLetters;
+    String source;
+    /** Position of the previously emitted term, used to compute the position increment. */
+    private int lastIncrementPosition = 0;
+
+    private PositionIncrementAttribute positionAttr = addAttribute(PositionIncrementAttribute.class);
+
+    /**
+     * @param in     upstream token stream
+     * @param config filter options
+     * @throws ConfigErrorException if none of the term-producing options is enabled
+     */
+    public PinyinTokenFilter(TokenStream in, PinyinConfig config) {
+        super(in);
+        this.config = config;
+        //validate config
+        if (!(config.keepFirstLetter || config.keepSeparateFirstLetter || config.keepFullPinyin || config.keepJoinedFullPinyin || config.keepSeparateChinese)) {
+            throw new ConfigErrorException("pinyin config error, can't disable separate_first_letter, first_letter and full_pinyin at the same time.");
+        }
+        candidate = new ArrayList<>();
+        firstLetters = new StringBuilder();
+        termsFilter = new HashSet<>();
+        fullPinyinLetters = new StringBuilder();
+    }
+
+    //TODO refactor, merge code
+    @Override
+    public final boolean incrementToken() throws IOException {
+
+        // still draining the candidates of the current input token
+        if (!done) {
+            if (readTerm()) return true;
+        }
+
+        // current token exhausted: pull the next one from upstream
+        if (done) {
+            resetVariable();
+            if (!input.incrementToken()) {
+                return false;
+            }
+            done = false;
+        }
+        // When readTerm() yields no candidate the term attribute is left untouched, so the
+        // upstream token is passed through unchanged.
+        readTerm();
+        return true;
+    }
+
+    /**
+     * Builds the candidates of the current input token on first call, then emits the next one.
+     *
+     * @return {@code true} if a candidate was written to the attributes, {@code false} if the
+     *         token produced nothing or all its candidates were already emitted
+     */
+    private boolean readTerm() {
+        if (!processedCandidate) {
+            processedCandidate = true;
+            lastOffset = termAtt.length();
+            source = termAtt.toString();
+            if (config.trimWhitespace) {
+                source = source.trim();
+            }
+
+            List<String> pinyinList = Pinyin.pinyin(source);
+            List<String> chineseList = ChineseUtil.segmentChinese(source);
+            if (pinyinList.size() == 0 || chineseList.size() == 0) return false;
+
+            // buffer for consecutive ASCII letters/digits
+            StringBuilder buff = new StringBuilder();
+            int buffStartPosition = 0;
+            int buffSize = 0;
+            position = 0;
+
+            for (int i = 0; i < source.length(); i++) {
+                char c = source.charAt(i);
+
+                //keep original alphabet
+                if (c < 128) {
+                    if (buff.length() <= 0) {
+                        buffStartPosition = i;
+                    }
+                    // ASCII letter or digit; any other ASCII char is skipped without flushing the buffer
+                    if ((c > 96 && c < 123) || (c > 64 && c < 91) || (c > 47 && c < 58)) {
+                        if (config.keepNoneChinese) {
+                            if (config.keepNoneChineseTogether) {
+                                buff.append(c);
+                                buffSize++;
+                            } else {
+                                addCandidate(new TermItem(String.valueOf(c), i, i + 1, buffStartPosition));
+                            }
+                        }
+                        if (config.keepNoneChineseInFirstLetter) {
+                            firstLetters.append(c);
+                        }
+                        if (config.keepNoneChineseInJoinedFullPinyin) {
+                            fullPinyinLetters.append(c);
+                        }
+                    }
+                } else {
+                    //clean previous temp
+                    if (buff.length() > 0) {
+                        buffSize = parseBuff(buff, buffSize, buffStartPosition);
+                    }
+
+                    String pinyin = pinyinList.get(i);
+                    String chinese = chineseList.get(i);
+                    if (pinyin != null && pinyin.length() > 0) {
+                        position++;
+                        firstLetters.append(pinyin.charAt(0));
+                        if (config.keepSeparateFirstLetter && pinyin.length() > 1) {
+                            addCandidate(new TermItem(String.valueOf(pinyin.charAt(0)), i, i + 1, position));
+                        }
+                        if (config.keepFullPinyin) {
+                            addCandidate(new TermItem(pinyin, i, i + 1, position));
+                        }
+                        if(config.keepSeparateChinese){
+                            addCandidate(new TermItem(chinese, i, i + 1, position));
+                        }
+                        if (config.keepJoinedFullPinyin) {
+                            fullPinyinLetters.append(pinyin);
+                        }
+                    }
+                }
+
+                lastOffset = i;
+
+            }
+
+            //clean previous temp
+            if (buff.length() > 0) {
+                buffSize = parseBuff(buff, buffSize, buffStartPosition);
+            }
+        }
+
+
+        if (config.keepOriginal && !processedOriginal) {
+            processedOriginal = true;
+            addCandidate(new TermItem(source, 0, source.length(), 1));
+        }
+
+        if (config.keepJoinedFullPinyin && !processedFullPinyinLetter && fullPinyinLetters.length() > 0) {
+            processedFullPinyinLetter = true;
+            addCandidate(new TermItem(fullPinyinLetters.toString(), 0, source.length(), 1));
+            fullPinyinLetters.setLength(0);
+        }
+
+
+        if (config.keepFirstLetter && firstLetters.length() > 0 && !processedFirstLetter) {
+            processedFirstLetter = true;
+            String fl;
+            if (firstLetters.length() > config.LimitFirstLetterLength && config.LimitFirstLetterLength > 0) {
+                fl = firstLetters.substring(0, config.LimitFirstLetterLength);
+            } else {
+                fl = firstLetters.toString();
+            }
+            if (config.lowercase) {
+                fl = fl.toLowerCase(java.util.Locale.ROOT);
+            }
+            // a single first letter is already covered by the separate-first-letter terms
+            if (!(config.keepSeparateFirstLetter && fl.length() <= 1)) {
+                addCandidate(new TermItem(fl, 0, fl.length(), 1));
+            }
+        }
+
+        if (!processedSortCandidate) {
+            processedSortCandidate = true;
+            Collections.sort(candidate);
+        }
+
+        if (candidateOffset < candidate.size()) {
+            TermItem item = candidate.get(candidateOffset);
+            candidateOffset++;
+            setTerm(item.term, item.startOffset, item.endOffset, item.position);
+            return true;
+        }
+
+        done = true;
+        return false;
+    }
+
+
+    /**
+     * Normalises the term of {@code item} and queues it unless it is empty or a duplicate.
+     * Duplicates are detected per term and position, or per term only when
+     * {@code removeDuplicateTerm} is set.
+     */
+    void addCandidate(TermItem item) {
+
+        String term = item.term;
+        if (config.lowercase) {
+            // Locale.ROOT: the result must not depend on the JVM default locale
+            term = term.toLowerCase(java.util.Locale.ROOT);
+        }
+
+        if (config.trimWhitespace) {
+            term = term.trim();
+        }
+        item.term = term;
+
+        if (term.length() == 0) {
+            return;
+        }
+
+        //remove same term with same position
+        String fr=term+item.position;
+
+        //remove same term, regardless position
+        if (config.removeDuplicateTerm) {
+            fr=term;
+        }
+
+        if (termsFilter.contains(fr)) {
+            return;
+        }
+        termsFilter.add(fr);
+
+        candidate.add(item);
+    }
+
+
+    /**
+     * Writes {@code term} to the term attribute and sets the position increment relative to
+     * the previously emitted term (never negative). The offsets are normalised locally but
+     * not written to any attribute.
+     */
+    void setTerm(String term, int startOffset, int endOffset, int position) {
+        if (config.lowercase) {
+            term = term.toLowerCase(java.util.Locale.ROOT);
+        }
+
+        if (config.trimWhitespace) {
+            term = term.trim();
+        }
+
+        //ignore empty term
+        if(term.length()==0){
+            return;
+        }
+
+        termAtt.setEmpty();
+        termAtt.append(term);
+        if (startOffset < 0) {
+            startOffset = 0;
+        }
+        if (endOffset < startOffset) {
+            endOffset = startOffset + term.length();
+        }
+
+        int offset = position - lastIncrementPosition;
+        if (offset < 0) {
+            offset = 0;
+        }
+        positionAttr.setPositionIncrement(offset);
+
+        lastIncrementPosition = position;
+    }
+
+    /**
+     * Turns the buffered ASCII text into candidates and clears the buffer.
+     *
+     * @param buff         buffered ASCII letters/digits; emptied by this call
+     * @param buffSize     number of buffered chars
+     * @param buffPosition index at which the buffer started (currently unused)
+     * @return the new buffer size, always 0
+     */
+    private int parseBuff(StringBuilder buff, int buffSize, int buffPosition) {
+        if (config.keepNoneChinese) {
+            if (config.noneChinesePinyinTokenize) {
+                List<String> result = PinyinAlphabetTokenizer.walk(buff.toString());
+                int start = (lastOffset - buffSize + 1);
+                for (int i = 0; i < result.size(); i++) {
+                    int end;
+                    String t = result.get(i);
+                    if (config.fixedPinyinOffset) {
+                        end = start + 1;
+                    } else {
+                        end = start + t.length();
+                    }
+                    addCandidate(new TermItem(t, start, end, ++position));
+                    start = end;
+                }
+            } else if (config.keepFirstLetter || config.keepSeparateFirstLetter || config.keepFullPinyin || !config.keepNoneChineseInJoinedFullPinyin) {
+                addCandidate(new TermItem(buff.toString(), lastOffset - buffSize, lastOffset, ++position));
+            }
+        }
+
+        buff.setLength(0);
+        buffSize = 0;
+        return buffSize;
+    }
+
+    @Override
+    public final void end() throws IOException {
+        super.end();
+    }
+
+    /** Clears all per-token state so the next input token starts from scratch. */
+    void resetVariable() {
+        position = 0;
+        lastOffset = 0;
+        candidate.clear();
+        this.processedCandidate = false;
+        this.processedFirstLetter = false;
+        this.processedFullPinyinLetter = false;
+        this.processedOriginal = false;
+        // must be cleared too, otherwise only the first token's candidates are ever sorted
+        this.processedSortCandidate = false;
+        firstLetters.setLength(0);
+        fullPinyinLetters.setLength(0);
+        source = null;
+        candidateOffset = 0;
+        termsFilter.clear();
+        lastIncrementPosition = 0;
+    }
+
+    @Override
+    public void reset() throws IOException {
+        super.reset();
+        this.done = true;
+        resetVariable();
+    }
+
+
+}

+ 23 - 0
src/main/java/org/elasticsearch/index/analysis/PinyinTokenFilterFactory.java

@@ -0,0 +1,23 @@
+package org.elasticsearch.index.analysis;
+
+
+import org.apache.lucene.analysis.TokenStream;
+import org.elasticsearch.analysis.PinyinConfig;
+import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.env.Environment;
+import org.elasticsearch.index.IndexSettings;
+
+/**
+ * Token filter factory that creates {@link PinyinTokenFilter} instances, all sharing the
+ * {@link PinyinConfig} parsed once from the filter's settings.
+ */
+public class PinyinTokenFilterFactory extends AbstractTokenFilterFactory {
+
+    /** Configuration parsed from the settings at construction time; never reassigned. */
+    private final PinyinConfig config;
+
+    /**
+     * @param indexSettings index settings, passed to the superclass
+     * @param environment   not used by this factory
+     * @param name          name of the filter, passed to the superclass
+     * @param settings      filter settings; passed to the superclass and parsed into a {@link PinyinConfig}
+     */
+    public PinyinTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
+        super(indexSettings, name, settings);
+        this.config = new PinyinConfig(settings);
+    }
+
+    /**
+     * Wraps the given stream in a new {@link PinyinTokenFilter} that uses this factory's configuration.
+     *
+     * @param tokenStream the stream to wrap
+     * @return the wrapping filter
+     */
+    @Override
+    public TokenStream create(TokenStream tokenStream) {
+        return new PinyinTokenFilter(tokenStream, config);
+    }
+}

+ 336 - 0
src/main/java/org/elasticsearch/index/analysis/PinyinTokenizer.java

@@ -0,0 +1,336 @@
+package org.elasticsearch.index.analysis;
+
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.elasticsearch.analysis.PinyinConfig;
+import org.nlpcn.commons.lang.pinyin.Pinyin;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Locale;
+
+
+public class PinyinTokenizer extends Tokenizer {
+
+
+    private static final int DEFAULT_BUFFER_SIZE = 256;
+    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+    private boolean done = false;
+    private boolean processedCandidate = false;
+    private boolean processedSortCandidate = false;
+    private boolean processedFirstLetter = false;
+    private boolean processedFullPinyinLetter = false;
+    private boolean processedOriginal = false;
+    protected int position = 0;
+    protected int lastOffset = 0;
+    private OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
+    private PositionIncrementAttribute positionAttr = addAttribute(PositionIncrementAttribute.class);
+    private PinyinConfig config;
+    ArrayList<TermItem> candidate;
+    protected int candidateOffset = 0; //indicate candidates process offset
+    private HashSet<String> termsFilter;
+    StringBuilder firstLetters;
+    StringBuilder fullPinyinLetters;
+
+    private int lastIncrementPosition = 0;
+
+    String source;
+
+    public PinyinTokenizer(PinyinConfig config) {
+        this(DEFAULT_BUFFER_SIZE);
+        this.config = config;
+
+        //validate config
+        if (!(config.keepFirstLetter || config.keepSeparateFirstLetter || config.keepFullPinyin || config.keepJoinedFullPinyin || config.keepSeparateChinese)) {
+            throw new ConfigErrorException("pinyin config error, can't disable separate_first_letter, first_letter and full_pinyin at the same time.");
+        }
+        candidate = new ArrayList<>();
+        termsFilter = new HashSet<>();
+        firstLetters = new StringBuilder();
+        fullPinyinLetters = new StringBuilder();
+    }
+
+    /**
+     * Creates the tokenizer and pre-sizes the term buffer.
+     * <p>
+     * This constructor does not assign {@code config}, {@code candidate}, {@code termsFilter},
+     * {@code firstLetters} or {@code fullPinyinLetters}; the {@code PinyinConfig} constructor
+     * assigns them after delegating here.
+     *
+     * @param bufferSize size passed to {@code termAtt.resizeBuffer}
+     */
+    public PinyinTokenizer(int bufferSize) {
+        super();
+        termAtt.resizeBuffer(bufferSize);
+    }
+
+    void addCandidate(TermItem item) {
+
+        String term = item.term;
+        if (config.lowercase) {
+            term = term.toLowerCase();
+        }
+
+        if (config.trimWhitespace) {
+            term = term.trim();
+        }
+        item.term = term;
+
+        if (term.length() == 0) {
+            return;
+        }
+
+        //remove same term with same position
+        String fr = term + item.position;
+
+        //remove same term, regardless position
+        if (config.removeDuplicateTerm) {
+            fr = term;
+        }
+
+        if (termsFilter.contains(fr)) {
+            return;
+        }
+        termsFilter.add(fr);
+
+        candidate.add(item);
+    }
+
+
+    /**
+     * Copies one term into the token attributes (term text, offsets and position increment).
+     * <p>
+     * The term is lowercased / trimmed according to the configuration. If it is empty afterwards the
+     * method returns without touching any attribute. Offsets are only written when
+     * {@code config.ignorePinyinOffset} is not set.
+     *
+     * @param term        term text
+     * @param startOffset start offset; negative values are treated as 0
+     * @param endOffset   end offset; if smaller than the start offset it is replaced by start + term length
+     * @param position    position of the term; the increment is computed against the previously set position
+     */
+    void setTerm(String term, int startOffset, int endOffset, int position) {
+        if (config.lowercase) {
+            // Locale.ROOT keeps the result independent of the JVM default locale
+            term = term.toLowerCase(Locale.ROOT);
+        }
+
+        if (config.trimWhitespace) {
+            term = term.trim();
+        }
+
+        //ignore empty term
+        if (term.isEmpty()) {
+            return;
+        }
+
+        termAtt.setEmpty();
+        termAtt.append(term);
+        if (startOffset < 0) {
+            startOffset = 0;
+        }
+        if (endOffset < startOffset) {
+            endOffset = startOffset + term.length();
+        }
+
+        if (!config.ignorePinyinOffset) {
+            offsetAtt.setOffset(correctOffset(startOffset), correctOffset(endOffset));
+        }
+
+        // increment relative to the previously emitted position, never negative
+        positionAttr.setPositionIncrement(Math.max(0, position - lastIncrementPosition));
+
+        lastIncrementPosition = position;
+    }
+
+    /**
+     * Advances to the next term.
+     * <p>
+     * The first call reads the whole input, converts it with {@code Pinyin.pinyin} and
+     * {@code ChineseUtil.segmentChinese}, and collects every candidate term (per-character pinyin,
+     * first letters, kept non-Chinese runs, and optionally the original text, the joined full pinyin
+     * and the joined first letters). The candidates are then sorted once by position and emitted one
+     * per call through {@link #setTerm(String, int, int, int)}.
+     *
+     * @return {@code true} if a candidate was emitted, {@code false} when there is nothing (more) to emit
+     * @throws IOException if reading the input fails
+     */
+    @Override
+    public final boolean incrementToken() throws IOException {
+
+        clearAttributes();
+
+        if (!done) {
+
+            //combine text together to get right pinyin
+            if (!processedCandidate) {
+                processedCandidate = true;
+                // read the complete input into the term buffer, growing it whenever it is full
+                int upto = 0;
+                char[] buffer = termAtt.buffer();
+                while (true) {
+                    final int length = input.read(buffer, upto, buffer.length - upto);
+                    if (length == -1) break;
+                    upto += length;
+                    if (upto == buffer.length)
+                        buffer = termAtt.resizeBuffer(1 + buffer.length);
+                }
+                termAtt.setLength(upto);
+                source = termAtt.toString();
+
+                // both lists are indexed below with the character index of source
+                // NOTE(review): assumes one list entry per char of source - confirm against the helpers
+                List<String> pinyinList = Pinyin.pinyin(source);
+                List<String> chineseList = ChineseUtil.segmentChinese(source);
+                if (pinyinList.size() == 0 || chineseList.size() == 0) return false;
+
+                // buff collects consecutive kept ASCII letters/digits until a non-ASCII char flushes it
+                StringBuilder buff = new StringBuilder();
+                int buffStartPosition = 0;
+                int buffSize = 0;
+
+                position = 0;
+
+                for (int i = 0; i < source.length(); i++) {
+
+                    char c = source.charAt(i);
+                    //keep original alphabet
+                    if (c < 128) {
+                        if (buff.length() <= 0) {
+                            buffStartPosition = i;
+                        }
+                        // a-z, A-Z or 0-9; other ASCII characters are skipped without flushing buff
+                        if ((c > 96 && c < 123) || (c > 64 && c < 91) || (c > 47 && c < 58)) {
+                            if (config.keepNoneChinese) {
+                                if (config.keepNoneChineseTogether) {
+                                    buff.append(c);
+                                    buffSize++;
+                                } else {
+                                    position++;
+                                    addCandidate(new TermItem(String.valueOf(c), i, i + 1, buffStartPosition + 1));
+                                }
+                            }
+                            if (config.keepNoneChineseInFirstLetter) {
+                                firstLetters.append(c);
+                            }
+                            if (config.keepNoneChineseInJoinedFullPinyin) {
+                                fullPinyinLetters.append(c);
+                            }
+                        }
+                    } else {
+
+                        //clean previous temp
+                        if (buff.length() > 0) {
+                            buffSize = parseBuff(buff, buffSize, buffStartPosition);
+                        }
+
+                        // tracks whether this character already advanced the position,
+                        // so that its first letter and full pinyin share one position
+                        boolean incrPosition = false;
+
+                        String pinyin = pinyinList.get(i);
+                        String chinese = chineseList.get(i);
+                        if (pinyin != null && pinyin.length() > 0) {
+                            firstLetters.append(pinyin.charAt(0));
+                            if (config.keepSeparateFirstLetter && pinyin.length() > 1) {
+                                position++;
+                                incrPosition = true;
+                                addCandidate(new TermItem(String.valueOf(pinyin.charAt(0)), i, i + 1, position));
+                            }
+                            if (config.keepFullPinyin) {
+                                if (!incrPosition) {
+                                    position++;
+                                }
+                                addCandidate(new TermItem(pinyin, i, i + 1, position));
+                            }
+                            if(config.keepSeparateChinese){
+                                addCandidate(new TermItem(chinese, i, i + 1, position));
+                            }
+                            if (config.keepJoinedFullPinyin) {
+                                fullPinyinLetters.append(pinyin);
+                            }
+                        }
+                    }
+
+                    lastOffset = i;
+
+                }
+
+                //clean previous temp
+                if (buff.length() > 0) {
+                    buffSize = parseBuff(buff, buffSize, buffStartPosition);
+                }
+            }
+
+            // whole-input candidates below are all placed at position 1
+            if (config.keepOriginal && !processedOriginal) {
+                processedOriginal = true;
+                addCandidate(new TermItem(source, 0, source.length(), 1));
+            }
+
+            if (config.keepJoinedFullPinyin && !processedFullPinyinLetter && fullPinyinLetters.length() > 0) {
+                processedFullPinyinLetter = true;
+                addCandidate(new TermItem(fullPinyinLetters.toString(), 0, source.length(), 1));
+                fullPinyinLetters.setLength(0);
+            }
+
+
+            if (config.keepFirstLetter && firstLetters.length() > 0 && !processedFirstLetter) {
+                processedFirstLetter = true;
+                String fl;
+                // a positive LimitFirstLetterLength truncates the joined first letters
+                if (firstLetters.length() > config.LimitFirstLetterLength && config.LimitFirstLetterLength > 0) {
+                    fl = firstLetters.substring(0, config.LimitFirstLetterLength);
+                } else {
+                    fl = firstLetters.toString();
+                }
+                if (config.lowercase) {
+                    // Locale.ROOT keeps the result independent of the JVM default locale
+                    fl = fl.toLowerCase(Locale.ROOT);
+                }
+                // a single first letter is already emitted separately when keepSeparateFirstLetter is on
+                if (!(config.keepSeparateFirstLetter && fl.length() <= 1)) {
+                    addCandidate(new TermItem(fl, 0, fl.length(), 1));
+                }
+            }
+
+            // sort once, after every candidate has been collected
+            if (!processedSortCandidate) {
+                processedSortCandidate = true;
+                Collections.sort(candidate);
+            }
+
+            if (candidateOffset < candidate.size()) {
+                TermItem item = candidate.get(candidateOffset);
+                candidateOffset++;
+                setTerm(item.term, item.startOffset, item.endOffset, item.position);
+                return true;
+            }
+
+
+            done = true;
+            return false;
+        }
+        return false;
+    }
+
+    private int parseBuff(StringBuilder buff, int buffSize, int buffPosition) {
+        if (config.keepNoneChinese) {
+            if (config.noneChinesePinyinTokenize) {
+                List<String> result = PinyinAlphabetTokenizer.walk(buff.toString());
+                int start = (lastOffset - buffSize + 1);
+                for (int i = 0; i < result.size(); i++) {
+                    int end;
+                    String t = result.get(i);
+                    if (config.fixedPinyinOffset) {
+                        end = start + 1;
+                    } else {
+                        end = start + t.length();
+                    }
+                    position++;
+                    addCandidate(new TermItem(result.get(i), start, end, position));
+                    start = end;
+                }
+            } else if (config.keepFirstLetter || config.keepSeparateFirstLetter || config.keepFullPinyin || !config.keepNoneChineseInJoinedFullPinyin) {
+                position++;
+                addCandidate(new TermItem(buff.toString(), lastOffset - buffSize, lastOffset, position));
+            }
+        }
+
+        buff.setLength(0);
+        buffSize = 0;
+        return buffSize;
+    }
+
+    /**
+     * Publishes the final offset of the stream, unless {@code config.ignorePinyinOffset} is set.
+     * <p>
+     * The final offset is the length of the text that was read from the input (0 when nothing has
+     * been read). Deriving it from the text, rather than incrementing {@code lastOffset}, gives the
+     * right value for empty input and keeps repeated calls from drifting.
+     *
+     * @throws IOException if {@code super.end()} throws it
+     */
+    @Override
+    public final void end() throws IOException {
+        super.end();
+        if (!config.ignorePinyinOffset) {
+            lastOffset = (source == null) ? 0 : source.length();
+            offsetAtt.setOffset(correctOffset(lastOffset), correctOffset(lastOffset));
+        }
+    }
+
+    /**
+     * Prepares the tokenizer for a new input: calls {@code super.reset()} and clears every piece of
+     * per-input state (flags, positions and offsets, candidates, duplicate filter, letter
+     * accumulators and the cached source text).
+     *
+     * @throws IOException if {@code super.reset()} throws it
+     */
+    @Override
+    public void reset() throws IOException {
+        super.reset();
+
+        // stage flags
+        done = false;
+        processedCandidate = false;
+        processedSortCandidate = false;
+        processedFirstLetter = false;
+        processedFullPinyinLetter = false;
+        processedOriginal = false;
+
+        // position / offset bookkeeping
+        position = 0;
+        lastOffset = 0;
+        lastIncrementPosition = 0;
+        candidateOffset = 0;
+
+        // collected terms and accumulators
+        candidate.clear();
+        termsFilter.clear();
+        firstLetters.setLength(0);
+        fullPinyinLetters.setLength(0);
+        source = null;
+    }
+
+
+}

+ 23 - 0
src/main/java/org/elasticsearch/index/analysis/PinyinTokenizerFactory.java

@@ -0,0 +1,23 @@
+package org.elasticsearch.index.analysis;
+
+import org.apache.lucene.analysis.Tokenizer;
+import org.elasticsearch.analysis.PinyinConfig;
+import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.env.Environment;
+import org.elasticsearch.index.IndexSettings;
+
+/**
+ * Tokenizer factory that creates {@link PinyinTokenizer} instances, all sharing the
+ * {@link PinyinConfig} parsed once from the tokenizer's settings.
+ */
+public class PinyinTokenizerFactory extends AbstractTokenizerFactory {
+
+    /** Configuration parsed from the settings at construction time; never reassigned. */
+    private final PinyinConfig config;
+
+    /**
+     * @param indexSettings index settings, passed to the superclass
+     * @param env           not used by this factory
+     * @param name          name of the tokenizer, passed to the superclass
+     * @param settings      tokenizer settings; passed to the superclass and parsed into a {@link PinyinConfig}
+     */
+    public PinyinTokenizerFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
+        super(indexSettings, settings, name);
+        this.config = new PinyinConfig(settings);
+    }
+
+    /**
+     * Creates a new {@link PinyinTokenizer} that uses this factory's configuration.
+     *
+     * @return the new tokenizer
+     */
+    @Override
+    public Tokenizer create() {
+        return new PinyinTokenizer(config);
+    }
+}
+

+ 31 - 0
src/main/java/org/elasticsearch/index/analysis/TermItem.java

@@ -0,0 +1,31 @@
+package org.elasticsearch.index.analysis;
+
+/**
+ * A candidate term: the term text together with its start/end offsets and its position.
+ * Instances are ordered by {@code position} only.
+ * <p>
+ * Created by IntelliJ IDEA.
+ * User: Medcl'
+ * Date: 12-5-21
+ * Time: 5:53 PM
+ */
+
+public class TermItem implements Comparable<TermItem>{
+    String term;
+    int startOffset;
+    int endOffset;
+    int position;
+
+    /**
+     * @param term        term text
+     * @param startOffset start offset of the term
+     * @param endOffset   end offset of the term
+     * @param position    position of the term, used for ordering
+     */
+    public TermItem(String term,int startOffset,int endOffset,int position){
+        this.term=term;
+        this.startOffset=startOffset;
+        this.endOffset=endOffset;
+        this.position=position;
+    }
+
+    /** Returns the term text. */
+    @Override
+    public String toString() {
+        return term;
+    }
+
+    /**
+     * Orders items by ascending position.
+     * Uses {@link Integer#compare(int, int)} because subtracting the positions can overflow and
+     * report the wrong sign for values that are far apart.
+     */
+    @Override
+    public int compareTo(TermItem o) {
+        return Integer.compare(this.position, o.position);
+    }
+}

+ 36 - 0
src/main/java/org/elasticsearch/plugin/analysis/pinyin/AnalysisPinyinPlugin.java

@@ -0,0 +1,36 @@
+package org.elasticsearch.plugin.analysis.pinyin;
+
+
+import org.apache.lucene.analysis.Analyzer;
+import org.elasticsearch.index.analysis.*;
+import org.elasticsearch.indices.analysis.AnalysisModule;
+import org.elasticsearch.plugins.AnalysisPlugin;
+import org.elasticsearch.plugins.Plugin;
+
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.Map;
+
+
+/**
+ * Plugin entry point: registers the pinyin tokenizers, token filter and analyzer under their names.
+ */
+public class AnalysisPinyinPlugin extends Plugin implements AnalysisPlugin {
+
+    /**
+     * Registers the tokenizers {@code "pinyin"} and {@code "pinyin_first_letter"}.
+     *
+     * @return a new mutable map from tokenizer name to its factory provider
+     */
+    @Override
+    public Map<String, AnalysisModule.AnalysisProvider<TokenizerFactory>> getTokenizers() {
+        final Map<String, AnalysisModule.AnalysisProvider<TokenizerFactory>> tokenizers = new HashMap<>();
+        tokenizers.put("pinyin", PinyinTokenizerFactory::new);
+        tokenizers.put("pinyin_first_letter", PinyinAbbreviationsTokenizerFactory::new);
+        return tokenizers;
+    }
+
+    /**
+     * Registers the token filter {@code "pinyin"}.
+     *
+     * @return a new mutable map from filter name to its factory provider
+     */
+    @Override
+    public Map<String, AnalysisModule.AnalysisProvider<org.elasticsearch.index.analysis.TokenFilterFactory>> getTokenFilters() {
+        final Map<String, AnalysisModule.AnalysisProvider<org.elasticsearch.index.analysis.TokenFilterFactory>> filters = new HashMap<>();
+        filters.put("pinyin", PinyinTokenFilterFactory::new);
+        return filters;
+    }
+
+    /**
+     * Registers the analyzer {@code "pinyin"}.
+     *
+     * @return a single-entry map from analyzer name to its provider
+     */
+    @Override
+    public Map<String, AnalysisModule.AnalysisProvider<AnalyzerProvider<? extends Analyzer>>> getAnalyzers() {
+        return Collections.singletonMap("pinyin", PinyinAnalyzerProvider::new);
+    }
+}

+ 440 - 0
src/main/resources/pinyin_alphabet.dict

@@ -0,0 +1,440 @@
+a
+ai
+an
+ang
+ao
+b
+ba
+bai
+ban
+bang
+bao
+bei
+ben
+beng
+bi
+bian
+biao
+bie
+bin
+bing
+bo
+bu
+c
+ca
+cai
+can
+cang
+cao
+ce
+cen
+ceng
+ch
+cha
+chai
+chan
+chang
+chao
+che
+chen
+cheng
+chi
+chong
+chou
+chu
+chua
+chuai
+chuan
+chuang
+chui
+chun
+chuo
+ci
+cong
+cou
+cu
+cuan
+cui
+cun
+cuo
+d
+da
+dai
+dan
+dang
+dao
+de
+dei
+den
+deng
+di
+dia
+dian
+diao
+die
+ding
+diu
+dong
+dou
+du
+duan
+dui
+dun
+duo
+e
+er
+f
+fa
+fan
+fang
+fei
+fen
+feng
+fiao
+fo
+fou
+fu
+g
+ga
+gai
+gan
+gang
+gao
+ge
+gei
+gen
+geng
+gong
+gou
+gu
+gua
+guai
+guan
+guang
+gui
+gun
+guo
+h
+ha
+hai
+han
+hang
+hao
+he
+hei
+hen
+heng
+hong
+hou
+hu
+hua
+huai
+huan
+huang
+hui
+hun
+huo
+i
+j
+ja
+ji
+jia
+jian
+jiang
+jiao
+jie
+jin
+jing
+jiong
+jiu
+ju
+juan
+jue
+jun
+k
+ka
+kai
+kan
+kang
+kao
+ke
+kei
+ken
+keng
+kong
+kou
+ku
+kua
+kuai
+kuan
+kuang
+kui
+kun
+kuo
+l
+la
+lai
+lan
+lang
+lao
+le
+lei
+leng
+li
+lia
+lian
+liang
+liao
+lie
+lin
+ling
+liu
+lo
+long
+lou
+lu
+luan
+lun
+luo
+lv
+lve
+lü
+lüe
+m
+ma
+mai
+man
+mang
+mao
+me
+mei
+men
+meng
+mi
+mian
+miao
+mie
+min
+ming
+miu
+mo
+mou
+mu
+n
+na
+nai
+nan
+nang
+nao
+ne
+nei
+nen
+neng
+ni
+nian
+niang
+niao
+nie
+nin
+ning
+niu
+nong
+nou
+nu
+nuan
+nun
+nuo
+nv
+nve
+nü
+nüe
+o
+p
+pa
+pai
+pan
+pang
+pao
+pei
+pen
+peng
+pi
+pian
+piao
+pie
+pin
+ping
+po
+pou
+pu
+q
+qi
+qia
+qian
+qiang
+qiao
+qie
+qin
+qing
+qiong
+qiu
+qu
+quan
+que
+qun
+r
+ran
+rang
+rao
+re
+ren
+reng
+ri
+rong
+rou
+ru
+ruan
+rui
+run
+ruo
+s
+sa
+sai
+san
+sang
+sao
+se
+sen
+seng
+sh
+sha
+shai
+shan
+shang
+shao
+she
+shei
+shen
+sheng
+shi
+shou
+shu
+shua
+shuai
+shuan
+shuang
+shui
+shun
+shuo
+si
+song
+sou
+su
+suan
+sui
+sun
+suo
+t
+ta
+tai
+tan
+tang
+tao
+te
+teng
+ti
+tian
+tiao
+tie
+ting
+tong
+tou
+tu
+tuan
+tui
+tun
+tuo
+u
+v
+w
+wa
+wai
+wan
+wang
+wei
+wen
+weng
+wo
+wu
+x
+xi
+xia
+xian
+xiang
+xiao
+xie
+xin
+xing
+xiong
+xiu
+xu
+xuan
+xue
+xun
+y
+ya
+yai
+yan
+yang
+yao
+ye
+yi
+yin
+ying
+yo
+yong
+you
+yu
+yuan
+yue
+yun
+z
+za
+zai
+zan
+zang
+zao
+ze
+zei
+zen
+zeng
+zh
+zha
+zhai
+zhan
+zhang
+zhao
+zhe
+zhei
+zhen
+zheng
+zhi
+zhong
+zhou
+zhu
+zhua
+zhuai
+zhuan
+zhuang
+zhui
+zhun
+zhuo
+zi
+zong
+zou
+zu
+zuan
+zui
+zun
+zuo

+ 57 - 0
src/main/resources/plugin-descriptor.properties

@@ -0,0 +1,57 @@
+# Elasticsearch plugin descriptor file
+# This file must exist as 'plugin-descriptor.properties' at
+# the root directory of all plugins.
+#
+# A plugin can be 'site', 'jvm', or both.
+#
+### example site plugin for "foo":
+#
+# foo.zip <-- zip file for the plugin, with this structure:
+#   _site/ <-- the contents that will be served
+#   plugin-descriptor.properties <-- example contents below:
+#
+# site=true
+# description=My cool plugin
+# version=1.0
+#
+### example jvm plugin for "foo"
+#
+# foo.zip <-- zip file for the plugin, with this structure:
+#   <arbitrary name1>.jar <-- classes, resources, dependencies
+#   <arbitrary nameN>.jar <-- any number of jars
+#   plugin-descriptor.properties <-- example contents below:
+#
+# jvm=true
+# classname=foo.bar.BazPlugin
+# description=My cool plugin
+# version=2.0.0-rc1
+# elasticsearch.version=2.0
+# java.version=1.7
+#
+### mandatory elements for all plugins:
+#
+# 'description': simple summary of the plugin
+description=${project.description}
+#
+# 'version': plugin's version
+version=${project.version}
+#
+# 'name': the plugin name
+name=${elasticsearch.plugin.name}
+
+#
+# 'classname': the name of the class to load, fully-qualified.
+classname=${elasticsearch.plugin.classname}
+#
+# 'java.version' version of java the code is built against
+# use the system property java.specification.version
+# version string must be a sequence of nonnegative decimal integers
+# separated by "."'s and may have leading zeros
+java.version=${maven.compiler.target}
+#
+# 'elasticsearch.version' version of elasticsearch compiled against
+# You will have to release a new version of the plugin for each new
+# elasticsearch release. This version is checked when the plugin
+# is loaded so Elasticsearch will refuse to start in the presence of
+# plugins with the incorrect elasticsearch.version.
+elasticsearch.version=${elasticsearch.version}

+ 30 - 0
src/test/java/org/elasticsearch/index/analysis/PinyinAlphabetTokenizerTest.java

@@ -0,0 +1,30 @@
+package org.elasticsearch.index.analysis;
+
+import java.util.Arrays;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+/**
+ * Splitting a pinyin string is hard to get perfect; the split with the fewest pieces is taken to be the best.
+ *
+ * @author shenyanchao
+ * @since 2018-10-08 12:22
+ */
+public class PinyinAlphabetTokenizerTest {
+
+    @Test
+    public void walk() throws Exception {
+
+        Assert.assertEquals(Arrays.asList("xian").toString(), PinyinAlphabetTokenizer.walk("xian").toString());
+        Assert.assertEquals(Arrays.asList("wo", "shi", "liang").toString(),
+                PinyinAlphabetTokenizer.walk("woshiliang").toString());
+
+        Assert.assertEquals(Arrays.asList("zhong", "hua", "ren", "min", "gong", "he", "guo").toString(),
+                PinyinAlphabetTokenizer.walk("zhonghuarenmingongheguo").toString());
+        Assert.assertEquals(
+                Arrays.asList("5", "zhong", "hua", "ren", "89", "min", "gong", "he", "guo", "234").toString(),
+                PinyinAlphabetTokenizer.walk("5zhonghuaren89mingongheguo234").toString());
+    }
+
+}

Filskillnaden har hållts tillbaka eftersom den är för stor
+ 1529 - 0
src/test/java/org/elasticsearch/index/analysis/PinyinAnalysisTest.java


+ 5 - 0
src/test/resources/log4j.properties

@@ -0,0 +1,5 @@
+log4j.rootLogger=INFO, out
+
+log4j.appender.out=org.apache.log4j.ConsoleAppender
+log4j.appender.out.layout=org.apache.log4j.PatternLayout
+log4j.appender.out.layout.conversionPattern=[%d{ISO8601}][%-5p][%-25c] %m%n