
Edge NGram with phrase matching

Many thanks to rendel for helping me find the right solution!

Andrei Stefan's solution is not optimal.

Why? First of all, the lowercase filter is missing from the search analyzer, which makes search inconvenient: the case has to match exactly. A custom analyzer with a lowercase filter is needed instead of "analyzer": "keyword".

Second, the analysis part is wrong! At index time, the string "F00.0 - Dementia in Alzheimer's disease with early onset" is analyzed with edge_ngram_analyzer. With this analyzer, we get the following array of tokens for the analyzed string:

{  "tokens": [    {      "end_offset": 2,       "token": "f0",       "type": "word",       "start_offset": 0,       "position": 0    },     {      "end_offset": 3,       "token": "f00",       "type": "word",       "start_offset": 0,       "position": 1    },     {      "end_offset": 6,       "token": "0 ",       "type": "word",       "start_offset": 4,       "position": 2    },     {      "end_offset": 9,       "token": "  ",       "type": "word",       "start_offset": 7,       "position": 3    },     {      "end_offset": 10,       "token": "  d",       "type": "word",       "start_offset": 7,       "position": 4    },     {      "end_offset": 11,       "token": "  de",       "type": "word",       "start_offset": 7,       "position": 5    },     {      "end_offset": 12,       "token": "  dem",       "type": "word",       "start_offset": 7,       "position": 6    },     {      "end_offset": 13,       "token": "  deme",       "type": "word",       "start_offset": 7,       "position": 7    },     {      "end_offset": 14,       "token": "  demen",       "type": "word",       "start_offset": 7,       "position": 8    },     {      "end_offset": 15,       "token": "  dement",       "type": "word",       "start_offset": 7,       "position": 9    },     {      "end_offset": 16,       "token": "  dementi",       "type": "word",       "start_offset": 7,       "position": 10    },     {      "end_offset": 17,       "token": "  dementia",       "type": "word",       "start_offset": 7,       "position": 11    },     {      "end_offset": 18,       "token": "  dementia ",       "type": "word",       "start_offset": 7,       "position": 12    },     {      "end_offset": 19,       "token": "  dementia i",       "type": "word",       "start_offset": 7,       "position": 13    },     {      "end_offset": 20,       "token": "  dementia in",       "type": "word",       "start_offset": 7,       "position": 14    },     {      "end_offset": 21,       "token": "  dementia in ",       "type": "word",       "start_offset": 7,       "position": 15    },     {      "end_offset": 22,       "token": "  dementia in a",       "type": "word",       "start_offset": 7,       "position": 16    },     {      "end_offset": 23,       "token": "  dementia in al",       "type": "word",       "start_offset": 7,       "position": 17    },     {      "end_offset": 24,       "token": "  dementia in alz",       "type": "word",       "start_offset": 7,       "position": 18    },     {      "end_offset": 25,       "token": "  dementia in alzh",       "type": "word",       "start_offset": 7,       "position": 19    },     {      "end_offset": 26,       "token": "  dementia in alzhe",       "type": "word",       "start_offset": 7,       "position": 20    },     {      "end_offset": 27,       "token": "  dementia in alzhei",       "type": "word",       "start_offset": 7,       "position": 21    },     {      "end_offset": 28,       "token": "  dementia in alzheim",       "type": "word",       "start_offset": 7,       "position": 22    },     {      "end_offset": 29,       "token": "  dementia in alzheime",       "type": "word",       "start_offset": 7,       "position": 23    },     {      "end_offset": 30,       "token": "  dementia in alzheimer",       "type": "word",       "start_offset": 7,       "position": 24    },     {      "end_offset": 33,       "token": "s ",       "type": "word",       "start_offset": 31,       "position": 25    },     {      "end_offset": 34,       "token": "s d",       "type": "word",       "start_offset": 
31,       "position": 26    },     {      "end_offset": 35,       "token": "s di",       "type": "word",       "start_offset": 31,       "position": 27    },     {      "end_offset": 36,       "token": "s dis",       "type": "word",       "start_offset": 31,       "position": 28    },     {      "end_offset": 37,       "token": "s dise",       "type": "word",       "start_offset": 31,       "position": 29    },     {      "end_offset": 38,       "token": "s disea",       "type": "word",       "start_offset": 31,       "position": 30    },     {      "end_offset": 39,       "token": "s diseas",       "type": "word",       "start_offset": 31,       "position": 31    },     {      "end_offset": 40,       "token": "s disease",       "type": "word",       "start_offset": 31,       "position": 32    },     {      "end_offset": 41,       "token": "s disease ",       "type": "word",       "start_offset": 31,       "position": 33    },     {      "end_offset": 42,       "token": "s disease w",       "type": "word",       "start_offset": 31,       "position": 34    },     {      "end_offset": 43,       "token": "s disease wi",       "type": "word",       "start_offset": 31,       "position": 35    },     {      "end_offset": 44,       "token": "s disease wit",       "type": "word",       "start_offset": 31,       "position": 36    },     {      "end_offset": 45,       "token": "s disease with",       "type": "word",       "start_offset": 31,       "position": 37    },     {      "end_offset": 46,       "token": "s disease with ",       "type": "word",       "start_offset": 31,       "position": 38    },     {      "end_offset": 47,       "token": "s disease with e",       "type": "word",       "start_offset": 31,       "position": 39    },     {      "end_offset": 48,       "token": "s disease with ea",       "type": "word",       "start_offset": 31,       "position": 40    },     {      "end_offset": 49,       "token": "s disease with ear",       "type": "word",       "start_offset": 31,       "position": 41    },     {      "end_offset": 50,       "token": "s disease with earl",       "type": "word",       "start_offset": 31,       "position": 42    },     {      "end_offset": 51,       "token": "s disease with early",       "type": "word",       "start_offset": 31,       "position": 43    },     {      "end_offset": 52,       "token": "s disease with early ",       "type": "word",       "start_offset": 31,       "position": 44    },     {      "end_offset": 53,       "token": "s disease with early o",       "type": "word",       "start_offset": 31,       "position": 45    },     {      "end_offset": 54,       "token": "s disease with early on",       "type": "word",       "start_offset": 31,       "position": 46    },     {      "end_offset": 55,       "token": "s disease with early ons",       "type": "word",       "start_offset": 31,       "position": 47    },     {      "end_offset": 56,       "token": "s disease with early onse",       "type": "word",       "start_offset": 31,       "position": 48    }  ]}
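For reference, output like the above can be reproduced with the _analyze API; a minimal sketch, assuming the index is called my_index and already has edge_ngram_analyzer defined in its settings:

GET /my_index/_analyze
{
  "analyzer": "edge_ngram_analyzer",
  "text": "F00.0 - Dementia in Alzheimer's disease with early onset"
}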

As you can see, the whole string is tokenized into tokens from 2 to 25 characters long. The string is tokenized linearly, spaces included, and the position is incremented by one for every new token.

There are several problems with this:

  1. The edge_ngram_analyzer produces useless tokens that will never be searched for, e.g.: "0 ", "  ", "  d", "s d", "s disease w", etc.
  2. Moreover, it does not produce many useful tokens, e.g.: "disease", "early onset", etc. If you try to search for any of these words, there will be 0 results (see the query sketch after this list).
  3. Note that the last token is "s disease with early onse". Where is the final "t"? Because of "max_gram": "25" we have "lost" some text in all fields. You can no longer search for that text, since there are no tokens for it.
  4. The trim filter only obscures the problem of filtering out extra spaces, when it could be handled by the tokenizer.
  5. The edge_ngram_analyzer increments the position of every token, which is problematic for positional queries such as phrase queries. One should use the edge_ngram_filter instead, which preserves the position of the token when generating the ngrams.
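To illustrate problem 2: with that mapping, a plain match query for one of those words comes back empty; a sketch, assuming the field is called Field as in the mapping below:

{
  "query": {
    "match": {
      "Field": "disease"
    }
  }
}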

The best solution.

The mappings and settings to use:

..."mappings": {    "Type": {       "_all":{          "analyzer": "edge_ngram_analyzer","search_analyzer": "keyword_analyzer"        },         "properties": {          "Field": { "search_analyzer": "keyword_analyzer",  "type": "string",  "analyzer": "edge_ngram_analyzer"          },......"settings": {   "analysis": {      "filter": {         "english_poss_stemmer": { "type": "stemmer", "name": "possessive_english"         },         "edge_ngram": {"type": "edgeNGram","min_gram": "2","max_gram": "25","token_chars": ["letter", "digit"]         }      },      "analyzer": {         "edge_ngram_analyzer": {"filter": ["lowercase", "english_poss_stemmer", "edge_ngram"],"tokenizer": "standard"         },         "keyword_analyzer": {"filter": ["lowercase", "english_poss_stemmer"],"tokenizer": "standard"         }      }   }}...

Take a look at the analysis:

{  "tokens": [    {      "end_offset": 5,       "token": "f0",       "type": "word",       "start_offset": 0,       "position": 0    },     {      "end_offset": 5,       "token": "f00",       "type": "word",       "start_offset": 0,       "position": 0    },     {      "end_offset": 5,       "token": "f00.",       "type": "word",       "start_offset": 0,       "position": 0    },     {      "end_offset": 5,       "token": "f00.0",       "type": "word",       "start_offset": 0,       "position": 0    },     {      "end_offset": 17,       "token": "de",       "type": "word",       "start_offset": 9,       "position": 2    },     {      "end_offset": 17,       "token": "dem",       "type": "word",       "start_offset": 9,       "position": 2    },     {      "end_offset": 17,       "token": "deme",       "type": "word",       "start_offset": 9,       "position": 2    },     {      "end_offset": 17,       "token": "demen",       "type": "word",       "start_offset": 9,       "position": 2    },     {      "end_offset": 17,       "token": "dement",       "type": "word",       "start_offset": 9,       "position": 2    },     {      "end_offset": 17,       "token": "dementi",       "type": "word",       "start_offset": 9,       "position": 2    },     {      "end_offset": 17,       "token": "dementia",       "type": "word",       "start_offset": 9,       "position": 2    },     {      "end_offset": 20,       "token": "in",       "type": "word",       "start_offset": 18,       "position": 3    },     {      "end_offset": 32,       "token": "al",       "type": "word",       "start_offset": 21,       "position": 4    },     {      "end_offset": 32,       "token": "alz",       "type": "word",       "start_offset": 21,       "position": 4    },     {      "end_offset": 32,       "token": "alzh",       "type": "word",       "start_offset": 21,       "position": 4    },     {      "end_offset": 32,       "token": "alzhe",       "type": "word",       "start_offset": 21,       "position": 4    },     {      "end_offset": 32,       "token": "alzhei",       "type": "word",       "start_offset": 21,       "position": 4    },     {      "end_offset": 32,       "token": "alzheim",       "type": "word",       "start_offset": 21,       "position": 4    },     {      "end_offset": 32,       "token": "alzheime",       "type": "word",       "start_offset": 21,       "position": 4    },     {      "end_offset": 32,       "token": "alzheimer",       "type": "word",       "start_offset": 21,       "position": 4    },     {      "end_offset": 40,       "token": "di",       "type": "word",       "start_offset": 33,       "position": 5    },     {      "end_offset": 40,       "token": "dis",       "type": "word",       "start_offset": 33,       "position": 5    },     {      "end_offset": 40,       "token": "dise",       "type": "word",       "start_offset": 33,       "position": 5    },     {      "end_offset": 40,       "token": "disea",       "type": "word",       "start_offset": 33,       "position": 5    },     {      "end_offset": 40,       "token": "diseas",       "type": "word",       "start_offset": 33,       "position": 5    },     {      "end_offset": 40,       "token": "disease",       "type": "word",       "start_offset": 33,       "position": 5    },     {      "end_offset": 45,       "token": "wi",       "type": "word",       "start_offset": 41,       "position": 6    },     {      "end_offset": 45,       "token": "wit",       "type": "word",       "start_offset": 41,       "position": 6    },     {      
"end_offset": 45,       "token": "with",       "type": "word",       "start_offset": 41,       "position": 6    },     {      "end_offset": 51,       "token": "ea",       "type": "word",       "start_offset": 46,       "position": 7    },     {      "end_offset": 51,       "token": "ear",       "type": "word",       "start_offset": 46,       "position": 7    },     {      "end_offset": 51,       "token": "earl",       "type": "word",       "start_offset": 46,       "position": 7    },     {      "end_offset": 51,       "token": "early",       "type": "word",       "start_offset": 46,       "position": 7    },     {      "end_offset": 57,       "token": "on",       "type": "word",       "start_offset": 52,       "position": 8    },     {      "end_offset": 57,       "token": "ons",       "type": "word",       "start_offset": 52,       "position": 8    },     {      "end_offset": 57,       "token": "onse",       "type": "word",       "start_offset": 52,       "position": 8    },     {      "end_offset": 57,       "token": "onset",       "type": "word",       "start_offset": 52,       "position": 8    }  ]}

At index time a text is tokenized by the standard tokenizer, then the separate words are filtered by the lowercase, possessive_english and edge_ngram filters. Tokens are produced only for words. At search time a text is tokenized by the standard tokenizer, then the separate words are filtered by lowercase and possessive_english. The searched words are matched against the tokens that were created at index time.

Thus we make incremental search possible!
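The search-time half can be inspected the same way; a sketch, again assuming the index name my_index:

GET /my_index/_analyze
{
  "analyzer": "keyword_analyzer",
  "text": "Alzheimer's Demen"
}

This yields just the tokens "alzheimer" and "demen", both of which exist among the index-time tokens above, so the document matches.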

Now, because we do ngram on separate words, we can even execute queries like

{
  "query": {
    "multi_match": {
      "query": "dem in alzh",
      "type": "phrase",
      "fields": ["_all"]
    }
  }
}

and get correct results.
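The phrase type works because the edge_ngram_filter preserves positions: the analyzed query tokens and the matching index tokens (taken from the analysis output above) sit on consecutive positions:

query:  dem(0)  in(1)  alzh(2)
index:  dem(2)  in(3)  alzh(4)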

No text is "lost", everything is searchable, and there is no need to deal with spaces with the trim filter anymore.


