Dictionary.java 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641
  1. /**
  2. * IK 中文分词 版本 5.0
  3. * IK Analyzer release 5.0
  4. *
  5. * Licensed to the Apache Software Foundation (ASF) under one or more
  6. * contributor license agreements. See the NOTICE file distributed with
  7. * this work for additional information regarding copyright ownership.
  8. * The ASF licenses this file to You under the Apache License, Version 2.0
  9. * (the "License"); you may not use this file except in compliance with
  10. * the License. You may obtain a copy of the License at
  11. *
  12. * http://www.apache.org/licenses/LICENSE-2.0
  13. *
  14. * Unless required by applicable law or agreed to in writing, software
  15. * distributed under the License is distributed on an "AS IS" BASIS,
  16. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  17. * See the License for the specific language governing permissions and
  18. * limitations under the License.
  19. *
  20. * 源代码由林良益(linliangyi2005@gmail.com)提供
  21. * 版权声明 2012,乌龙茶工作室
  22. * provided by Linliangyi and copyright 2012 by Oolong studio
  23. *
  24. *
  25. */
  26. package org.wltea.analyzer.dic;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.nio.file.FileVisitResult;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.SimpleFileVisitor;
import java.nio.file.attribute.BasicFileAttributes;
import java.security.AccessController;
import java.security.PrivilegedAction;
import java.util.*;
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;

import org.apache.http.Header;
import org.apache.http.HttpEntity;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.logging.log4j.Logger;
import org.elasticsearch.SpecialPermission;
import org.elasticsearch.core.PathUtils;
import org.elasticsearch.plugin.analysis.ik.AnalysisIkPlugin;
import org.wltea.analyzer.cfg.Configuration;
import org.wltea.analyzer.help.ESPluginLoggerFactory;
  58. /**
  59. * 词典管理类,单子模式
  60. */
  61. public class Dictionary {
  62. /*
  63. * 词典单子实例
  64. */
  65. private static Dictionary singleton;
  66. private DictSegment _MainDict;
  67. private DictSegment _QuantifierDict;
  68. private DictSegment _StopWords;
  69. /**
  70. * 配置对象
  71. */
  72. private Configuration configuration;
  73. private static final Logger logger = ESPluginLoggerFactory.getLogger(Dictionary.class.getName());
  74. private static ScheduledExecutorService pool = Executors.newScheduledThreadPool(1);
  75. private static final String PATH_DIC_MAIN = "main.dic";
  76. private static final String PATH_DIC_SURNAME = "surname.dic";
  77. private static final String PATH_DIC_QUANTIFIER = "quantifier.dic";
  78. private static final String PATH_DIC_SUFFIX = "suffix.dic";
  79. private static final String PATH_DIC_PREP = "preposition.dic";
  80. private static final String PATH_DIC_STOP = "stopword.dic";
  81. private final static String FILE_NAME = "IKAnalyzer.cfg.xml";
  82. private final static String EXT_DICT = "ext_dict";
  83. private final static String REMOTE_EXT_DICT = "remote_ext_dict";
  84. private final static String EXT_STOP = "ext_stopwords";
  85. private final static String REMOTE_EXT_STOP = "remote_ext_stopwords";
  86. private Path conf_dir;
  87. private Properties props;
  88. private Dictionary(Configuration cfg) {
  89. this.configuration = cfg;
  90. this.props = new Properties();
  91. this.conf_dir = cfg.getEnvironment().configFile().resolve(AnalysisIkPlugin.PLUGIN_NAME);
  92. Path configFile = conf_dir.resolve(FILE_NAME);
  93. InputStream input = null;
  94. try {
  95. logger.info("try load config from {}", configFile);
  96. input = new FileInputStream(configFile.toFile());
  97. } catch (FileNotFoundException e) {
  98. conf_dir = cfg.getConfigInPluginDir();
  99. configFile = conf_dir.resolve(FILE_NAME);
  100. try {
  101. logger.info("try load config from {}", configFile);
  102. input = new FileInputStream(configFile.toFile());
  103. } catch (FileNotFoundException ex) {
  104. // We should report origin exception
  105. logger.error("ik-analyzer", e);
  106. }
  107. }
  108. if (input != null) {
  109. try {
  110. props.loadFromXML(input);
  111. } catch (IOException e) {
  112. logger.error("ik-analyzer", e);
  113. }
  114. }
  115. }
  116. public String getProperty(String key){
  117. if(props!=null){
  118. return props.getProperty(key);
  119. }
  120. return null;
  121. }
  122. /**
  123. * 词典初始化 由于IK Analyzer的词典采用Dictionary类的静态方法进行词典初始化
  124. * 只有当Dictionary类被实际调用时,才会开始载入词典, 这将延长首次分词操作的时间 该方法提供了一个在应用加载阶段就初始化字典的手段
  125. *
  126. * @return Dictionary
  127. */
  128. public static synchronized void initial(Configuration cfg) {
  129. if (singleton == null) {
  130. synchronized (Dictionary.class) {
  131. if (singleton == null) {
  132. singleton = new Dictionary(cfg);
  133. singleton.loadMainDict();
  134. singleton.loadSurnameDict();
  135. singleton.loadQuantifierDict();
  136. singleton.loadSuffixDict();
  137. singleton.loadPrepDict();
  138. singleton.loadStopWordDict();
  139. if(cfg.isEnableRemoteDict()){
  140. // 建立监控线程
  141. for (String location : singleton.getRemoteExtDictionarys()) {
  142. // 10 秒是初始延迟可以修改的 60是间隔时间 单位秒
  143. pool.scheduleAtFixedRate(new Monitor(location), 10, 60, TimeUnit.SECONDS);
  144. }
  145. for (String location : singleton.getRemoteExtStopWordDictionarys()) {
  146. pool.scheduleAtFixedRate(new Monitor(location), 10, 60, TimeUnit.SECONDS);
  147. }
  148. //建立数据库监控线程
  149. pool.scheduleAtFixedRate(new DatabaseMonitor(),10,120,TimeUnit.SECONDS);
  150. }
  151. }
  152. }
  153. }
  154. }
  155. private void walkFileTree(List<String> files, Path path) {
  156. if (Files.isRegularFile(path)) {
  157. files.add(path.toString());
  158. } else if (Files.isDirectory(path)) try {
  159. Files.walkFileTree(path, new SimpleFileVisitor<Path>() {
  160. @Override
  161. public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) {
  162. files.add(file.toString());
  163. return FileVisitResult.CONTINUE;
  164. }
  165. @Override
  166. public FileVisitResult visitFileFailed(Path file, IOException e) {
  167. logger.error("[Ext Loading] listing files", e);
  168. return FileVisitResult.CONTINUE;
  169. }
  170. });
  171. } catch (IOException e) {
  172. logger.error("[Ext Loading] listing files", e);
  173. } else {
  174. logger.warn("[Ext Loading] file not found: " + path);
  175. }
  176. }
  177. private void loadDictFile(DictSegment dict, Path file, boolean critical, String name) {
  178. try (InputStream is = new FileInputStream(file.toFile())) {
  179. BufferedReader br = new BufferedReader(
  180. new InputStreamReader(is, "UTF-8"), 512);
  181. String word = br.readLine();
  182. if (word != null) {
  183. if (word.startsWith("\uFEFF"))
  184. word = word.substring(1);
  185. for (; word != null; word = br.readLine()) {
  186. word = word.trim();
  187. if (word.isEmpty()) continue;
  188. dict.fillSegment(word.toCharArray());
  189. }
  190. }
  191. } catch (FileNotFoundException e) {
  192. logger.error("ik-analyzer: " + name + " not found", e);
  193. if (critical) throw new RuntimeException("ik-analyzer: " + name + " not found!!!", e);
  194. } catch (IOException e) {
  195. logger.error("ik-analyzer: " + name + " loading failed", e);
  196. }
  197. }
  198. private List<String> getExtDictionarys() {
  199. List<String> extDictFiles = new ArrayList<String>(2);
  200. String extDictCfg = getProperty(EXT_DICT);
  201. if (extDictCfg != null) {
  202. String[] filePaths = extDictCfg.split(";");
  203. for (String filePath : filePaths) {
  204. if (filePath != null && !"".equals(filePath.trim())) {
  205. Path file = PathUtils.get(getDictRoot(), filePath.trim());
  206. walkFileTree(extDictFiles, file);
  207. }
  208. }
  209. }
  210. return extDictFiles;
  211. }
  212. private List<String> getRemoteExtDictionarys() {
  213. List<String> remoteExtDictFiles = new ArrayList<String>(2);
  214. String remoteExtDictCfg = getProperty(REMOTE_EXT_DICT);
  215. if (remoteExtDictCfg != null) {
  216. String[] filePaths = remoteExtDictCfg.split(";");
  217. for (String filePath : filePaths) {
  218. if (filePath != null && !"".equals(filePath.trim())) {
  219. remoteExtDictFiles.add(filePath);
  220. }
  221. }
  222. }
  223. return remoteExtDictFiles;
  224. }
  225. private List<String> getExtStopWordDictionarys() {
  226. List<String> extStopWordDictFiles = new ArrayList<String>(2);
  227. String extStopWordDictCfg = getProperty(EXT_STOP);
  228. if (extStopWordDictCfg != null) {
  229. String[] filePaths = extStopWordDictCfg.split(";");
  230. for (String filePath : filePaths) {
  231. if (filePath != null && !"".equals(filePath.trim())) {
  232. Path file = PathUtils.get(getDictRoot(), filePath.trim());
  233. walkFileTree(extStopWordDictFiles, file);
  234. }
  235. }
  236. }
  237. return extStopWordDictFiles;
  238. }
  239. private List<String> getRemoteExtStopWordDictionarys() {
  240. List<String> remoteExtStopWordDictFiles = new ArrayList<String>(2);
  241. String remoteExtStopWordDictCfg = getProperty(REMOTE_EXT_STOP);
  242. if (remoteExtStopWordDictCfg != null) {
  243. String[] filePaths = remoteExtStopWordDictCfg.split(";");
  244. for (String filePath : filePaths) {
  245. if (filePath != null && !"".equals(filePath.trim())) {
  246. remoteExtStopWordDictFiles.add(filePath);
  247. }
  248. }
  249. }
  250. return remoteExtStopWordDictFiles;
  251. }
  252. private String getDictRoot() {
  253. return conf_dir.toAbsolutePath().toString();
  254. }
  255. /**
  256. * 获取词典单子实例
  257. *
  258. * @return Dictionary 单例对象
  259. */
  260. public static Dictionary getSingleton() {
  261. if (singleton == null) {
  262. throw new IllegalStateException("ik dict has not been initialized yet, please call initial method first.");
  263. }
  264. return singleton;
  265. }
  266. /**
  267. * 批量加载新停顿词条
  268. *
  269. * @param words
  270. * Collection<String>词条列表
  271. */
  272. public void addStopWords(Collection<String> words) {
  273. if (words != null) {
  274. for (String word : words) {
  275. if (word != null) {
  276. // 批量加载词条到主内存词典中
  277. singleton._StopWords.fillSegment(word.trim().toCharArray());
  278. }
  279. }
  280. }
  281. }
  282. /**
  283. * 批量移除(屏蔽)停顿词条
  284. */
  285. public void disableStopWords(Collection<String> words) {
  286. if (words != null) {
  287. for (String word : words) {
  288. if (word != null) {
  289. // 批量屏蔽词条
  290. singleton._StopWords.disableSegment(word.trim().toCharArray());
  291. }
  292. }
  293. }
  294. }
  295. /**
  296. * 批量加载新词条
  297. *
  298. * @param words
  299. * Collection<String>词条列表
  300. */
  301. public void addMainWords(Collection<String> words) {
  302. if (words != null) {
  303. for (String word : words) {
  304. if (word != null) {
  305. // 批量加载词条到主内存词典中
  306. singleton._MainDict.fillSegment(word.trim().toCharArray());
  307. }
  308. }
  309. }
  310. }
  311. /**
  312. * 批量移除(屏蔽)词条
  313. */
  314. public void disableMainWords(Collection<String> words) {
  315. if (words != null) {
  316. for (String word : words) {
  317. if (word != null) {
  318. // 批量屏蔽词条
  319. singleton._MainDict.disableSegment(word.trim().toCharArray());
  320. }
  321. }
  322. }
  323. }
  324. /**
  325. * 批量加载新词条
  326. *
  327. * @param words
  328. * Collection<String>词条列表
  329. */
  330. public void addWords(Collection<String> words) {
  331. if (words != null) {
  332. for (String word : words) {
  333. if (word != null) {
  334. // 批量加载词条到主内存词典中
  335. singleton._MainDict.fillSegment(word.trim().toCharArray());
  336. }
  337. }
  338. }
  339. }
  340. /**
  341. * 批量移除(屏蔽)词条
  342. */
  343. public void disableWords(Collection<String> words) {
  344. if (words != null) {
  345. for (String word : words) {
  346. if (word != null) {
  347. // 批量屏蔽词条
  348. singleton._MainDict.disableSegment(word.trim().toCharArray());
  349. }
  350. }
  351. }
  352. }
  353. /**
  354. * 检索匹配主词典
  355. *
  356. * @return Hit 匹配结果描述
  357. */
  358. public Hit matchInMainDict(char[] charArray) {
  359. return singleton._MainDict.match(charArray);
  360. }
  361. /**
  362. * 检索匹配主词典
  363. *
  364. * @return Hit 匹配结果描述
  365. */
  366. public Hit matchInMainDict(char[] charArray, int begin, int length) {
  367. return singleton._MainDict.match(charArray, begin, length);
  368. }
  369. /**
  370. * 检索匹配量词词典
  371. *
  372. * @return Hit 匹配结果描述
  373. */
  374. public Hit matchInQuantifierDict(char[] charArray, int begin, int length) {
  375. return singleton._QuantifierDict.match(charArray, begin, length);
  376. }
  377. /**
  378. * 从已匹配的Hit中直接取出DictSegment,继续向下匹配
  379. *
  380. * @return Hit
  381. */
  382. public Hit matchWithHit(char[] charArray, int currentIndex, Hit matchedHit) {
  383. DictSegment ds = matchedHit.getMatchedDictSegment();
  384. return ds.match(charArray, currentIndex, 1, matchedHit);
  385. }
  386. /**
  387. * 判断是否是停止词
  388. *
  389. * @return boolean
  390. */
  391. public boolean isStopWord(char[] charArray, int begin, int length) {
  392. return singleton._StopWords.match(charArray, begin, length).isMatch();
  393. }
  394. /**
  395. * 加载主词典及扩展词典
  396. */
  397. private void loadMainDict() {
  398. // 建立一个主词典实例
  399. _MainDict = new DictSegment((char) 0);
  400. // 读取主词典文件
  401. Path file = PathUtils.get(getDictRoot(), Dictionary.PATH_DIC_MAIN);
  402. loadDictFile(_MainDict, file, false, "Main Dict");
  403. // 加载扩展词典
  404. this.loadExtDict();
  405. // 加载远程自定义词库
  406. this.loadRemoteExtDict();
  407. }
  408. /**
  409. * 加载用户配置的扩展词典到主词库表
  410. */
  411. private void loadExtDict() {
  412. // 加载扩展词典配置
  413. List<String> extDictFiles = getExtDictionarys();
  414. if (extDictFiles != null) {
  415. for (String extDictName : extDictFiles) {
  416. // 读取扩展词典文件
  417. logger.info("[Dict Loading] " + extDictName);
  418. Path file = PathUtils.get(extDictName);
  419. loadDictFile(_MainDict, file, false, "Extra Dict");
  420. }
  421. }
  422. }
  423. /**
  424. * 加载远程扩展词典到主词库表
  425. */
  426. private void loadRemoteExtDict() {
  427. List<String> remoteExtDictFiles = getRemoteExtDictionarys();
  428. for (String location : remoteExtDictFiles) {
  429. logger.info("[Dict Loading] " + location);
  430. List<String> lists = getRemoteWords(location);
  431. // 如果找不到扩展的字典,则忽略
  432. if (lists == null) {
  433. logger.error("[Dict Loading] " + location + " load failed");
  434. continue;
  435. }
  436. for (String theWord : lists) {
  437. if (theWord != null && !"".equals(theWord.trim())) {
  438. // 加载扩展词典数据到主内存词典中
  439. logger.info(theWord);
  440. _MainDict.fillSegment(theWord.trim().toLowerCase().toCharArray());
  441. }
  442. }
  443. }
  444. }
  445. private static List<String> getRemoteWords(String location) {
  446. SpecialPermission.check();
  447. return AccessController.doPrivileged((PrivilegedAction<List<String>>) () -> {
  448. return getRemoteWordsUnprivileged(location);
  449. });
  450. }
  451. /**
  452. * 从远程服务器上下载自定义词条
  453. */
  454. private static List<String> getRemoteWordsUnprivileged(String location) {
  455. List<String> buffer = new ArrayList<String>();
  456. RequestConfig rc = RequestConfig.custom().setConnectionRequestTimeout(10 * 1000).setConnectTimeout(10 * 1000)
  457. .setSocketTimeout(60 * 1000).build();
  458. CloseableHttpClient httpclient = HttpClients.createDefault();
  459. CloseableHttpResponse response;
  460. BufferedReader in;
  461. HttpGet get = new HttpGet(location);
  462. get.setConfig(rc);
  463. try {
  464. response = httpclient.execute(get);
  465. if (response.getStatusLine().getStatusCode() == 200) {
  466. String charset = "UTF-8";
  467. // 获取编码,默认为utf-8
  468. HttpEntity entity = response.getEntity();
  469. if(entity!=null){
  470. Header contentType = entity.getContentType();
  471. if(contentType!=null&&contentType.getValue()!=null){
  472. String typeValue = contentType.getValue();
  473. if(typeValue!=null&&typeValue.contains("charset=")){
  474. charset = typeValue.substring(typeValue.lastIndexOf("=") + 1);
  475. }
  476. }
  477. if (entity.getContentLength() > 0 || entity.isChunked()) {
  478. in = new BufferedReader(new InputStreamReader(entity.getContent(), charset));
  479. String line;
  480. while ((line = in.readLine()) != null) {
  481. buffer.add(line);
  482. }
  483. in.close();
  484. response.close();
  485. return buffer;
  486. }
  487. }
  488. }
  489. response.close();
  490. } catch (IllegalStateException | IOException e) {
  491. logger.error("getRemoteWords {} error", e, location);
  492. }
  493. return buffer;
  494. }
  495. /**
  496. * 加载用户扩展的停止词词典
  497. */
  498. private void loadStopWordDict() {
  499. // 建立主词典实例
  500. _StopWords = new DictSegment((char) 0);
  501. // 读取主词典文件
  502. Path file = PathUtils.get(getDictRoot(), Dictionary.PATH_DIC_STOP);
  503. loadDictFile(_StopWords, file, false, "Main Stopwords");
  504. // 加载扩展停止词典
  505. List<String> extStopWordDictFiles = getExtStopWordDictionarys();
  506. if (extStopWordDictFiles != null) {
  507. for (String extStopWordDictName : extStopWordDictFiles) {
  508. logger.info("[Dict Loading] " + extStopWordDictName);
  509. // 读取扩展词典文件
  510. file = PathUtils.get(extStopWordDictName);
  511. loadDictFile(_StopWords, file, false, "Extra Stopwords");
  512. }
  513. }
  514. // 加载远程停用词典
  515. List<String> remoteExtStopWordDictFiles = getRemoteExtStopWordDictionarys();
  516. for (String location : remoteExtStopWordDictFiles) {
  517. logger.info("[Dict Loading] " + location);
  518. List<String> lists = getRemoteWords(location);
  519. // 如果找不到扩展的字典,则忽略
  520. if (lists == null) {
  521. logger.error("[Dict Loading] " + location + " load failed");
  522. continue;
  523. }
  524. for (String theWord : lists) {
  525. if (theWord != null && !"".equals(theWord.trim())) {
  526. // 加载远程词典数据到主内存中
  527. logger.info(theWord);
  528. _StopWords.fillSegment(theWord.trim().toLowerCase().toCharArray());
  529. }
  530. }
  531. }
  532. }
  533. /**
  534. * 加载量词词典
  535. */
  536. private void loadQuantifierDict() {
  537. // 建立一个量词典实例
  538. _QuantifierDict = new DictSegment((char) 0);
  539. // 读取量词词典文件
  540. Path file = PathUtils.get(getDictRoot(), Dictionary.PATH_DIC_QUANTIFIER);
  541. loadDictFile(_QuantifierDict, file, false, "Quantifier");
  542. }
  543. private void loadSurnameDict() {
  544. DictSegment _SurnameDict = new DictSegment((char) 0);
  545. Path file = PathUtils.get(getDictRoot(), Dictionary.PATH_DIC_SURNAME);
  546. loadDictFile(_SurnameDict, file, true, "Surname");
  547. }
  548. private void loadSuffixDict() {
  549. DictSegment _SuffixDict = new DictSegment((char) 0);
  550. Path file = PathUtils.get(getDictRoot(), Dictionary.PATH_DIC_SUFFIX);
  551. loadDictFile(_SuffixDict, file, true, "Suffix");
  552. }
  553. private void loadPrepDict() {
  554. DictSegment _PrepDict = new DictSegment((char) 0);
  555. Path file = PathUtils.get(getDictRoot(), Dictionary.PATH_DIC_PREP);
  556. loadDictFile(_PrepDict, file, true, "Preposition");
  557. }
  558. void reLoadMainDict() {
  559. logger.info("start to reload ik dict.");
  560. // 新开一个实例加载词典,减少加载过程对当前词典使用的影响
  561. Dictionary tmpDict = new Dictionary(configuration);
  562. tmpDict.configuration = getSingleton().configuration;
  563. tmpDict.loadMainDict();
  564. tmpDict.loadStopWordDict();
  565. _MainDict = tmpDict._MainDict;
  566. _StopWords = tmpDict._StopWords;
  567. logger.info("reload ik dict finished.");
  568. }
  569. }