示例文本:
"Set the shape to semi-transparent by calling set_trans(5)"
标准分析器(默认使用)
分词结果:
set, the, shape, to, semi, transparent, by, calling, set_trans, 5
POST _analyze
{
"analyzer": "standard",
"text": "Like X 国庆放假的"
}
{
"tokens" : [
{
"token" : "like",
"start_offset" : 0,
"end_offset" : 4,
"type" : "<ALPHANUM>",
"position" : 0
},
{
"token" : "x",
"start_offset" : 5,
"end_offset" : 6,
"type" : "<ALPHANUM>",
"position" : 1
},
{
"token" : "国",
"start_offset" : 7,
"end_offset" : 8,
"type" : "<IDEOGRAPHIC>",
"position" : 2
},
{
"token" : "庆",
"start_offset" : 8,
"end_offset" : 9,
"type" : "<IDEOGRAPHIC>",
"position" : 3
},
{
"token" : "放",
"start_offset" : 9,
"end_offset" : 10,
"type" : "<IDEOGRAPHIC>",
"position" : 4
},
{
"token" : "假",
"start_offset" : 10,
"end_offset" : 11,
"type" : "<IDEOGRAPHIC>",
"position" : 5
},
{
"token" : "的",
"start_offset" : 11,
"end_offset" : 12,
"type" : "<IDEOGRAPHIC>",
"position" : 6
}
]
}
简单分析器
简单分析器在任何不是字母的地方分隔文本,将词条小写
结果:
set, the, shape, to, semi, transparent, by, calling, set, trans
POST _analyze
{
"analyzer": "simple",
"text": "Like X 国庆放假的"
}
{
"tokens" : [
{
"token" : "like",
"start_offset" : 0,
"end_offset" : 4,
"type" : "word",
"position" : 0
},
{
"token" : "x",
"start_offset" : 5,
"end_offset" : 6,
"type" : "word",
"position" : 1
},
{
"token" : "国庆放假的",
"start_offset" : 7,
"end_offset" : 12,
"type" : "word",
"position" : 2
}
]
}
空格分析器
空格分析器在空格的地方划分文本
结果:
Set, the, shape, to, semi-transparent, by, calling, set_trans(5)
POST _analyze
{
"analyzer": "whitespace",
"text": "Like X 国庆放假的"
}
{
"tokens" : [
{
"token" : "Like",
"start_offset" : 0,
"end_offset" : 4,
"type" : "word",
"position" : 0
},
{
"token" : "X",
"start_offset" : 5,
"end_offset" : 6,
"type" : "word",
"position" : 1
},
{
"token" : "国庆放假的",
"start_offset" : 7,
"end_offset" : 12,
"type" : "word",
"position" : 2
}
]
}
语言分析器
语言分析器针对特定语言的特点进行分词,Elasticsearch 内置了多种语言的分析器(例如 english、french、cjk)
Shingles:unigram、bigrams、trigrams
unigram
bigrams
新建索引 setting:
PUT /test_index
{
"settings":{
"number_of_shards":1,
"analysis":{
"analyzer":{
"bigrams_analyzer_1":{
"type":"custom",
"tokenizer":"standard",
"filter":[
"lowercase",
"bigrams_filter_1"
]
},
"bigrams_analyzer_2":{
"type":"custom",
"tokenizer":"standard",
"filter":[
"lowercase",
"bigrams_filter_2"
]
}
},
"filter":{
"bigrams_filter_1":{
"type":"shingle",
"min_shingle_size":2,
"max_shingle_size":2,
"output_unigrams":true
},
"bigrams_filter_2":{
"type":"shingle",
"min_shingle_size":2,
"max_shingle_size":2,
"output_unigrams":false
}
}
}
}
}
# 说明1:min_shingle_size / max_shingle_size 的默认值都是 2,所以这里实际上可以不设置。
# 说明2:shingle 语汇单元过滤器默认会输出 unigrams(output_unigrams 默认为 true);bigrams_filter_2 将其设为 false,是为了把 unigrams 和 bigrams 分开。
PUT /test_index/_mapping/_doc
{
"_doc": {
"properties": {
"name1": {
"type": "text",
"analyzer": "bigrams_analyzer_1"
},
"name2": {
"type": "text",
"analyzer": "bigrams_analyzer_2"
}
}
}
}
PUT test_index/_doc/1
{
"name1":"北京协和医院",
"name2":"北京协和医院"
}
POST test_index/_analyze
{
"field":"name1",
"text":"北京协和医院"
}
结果:
{
"tokens" : [
{
"token" : "北",
"start_offset" : 0,
"end_offset" : 1,
"type" : "<IDEOGRAPHIC>",
"position" : 0
},
{
"token" : "北 京",
"start_offset" : 0,
"end_offset" : 2,
"type" : "shingle",
"position" : 0,
"positionLength" : 2
},
{
"token" : "京",
"start_offset" : 1,
"end_offset" : 2,
"type" : "<IDEOGRAPHIC>",
"position" : 1
},
{
"token" : "京 协",
"start_offset" : 1,
"end_offset" : 3,
"type" : "shingle",
"position" : 1,
"positionLength" : 2
},
{
"token" : "协",
"start_offset" : 2,
"end_offset" : 3,
"type" : "<IDEOGRAPHIC>",
"position" : 2
},
{
"token" : "协 和",
"start_offset" : 2,
"end_offset" : 4,
"type" : "shingle",
"position" : 2,
"positionLength" : 2
},
{
"token" : "和",
"start_offset" : 3,
"end_offset" : 4,
"type" : "<IDEOGRAPHIC>",
"position" : 3
},
{
"token" : "和 医",
"start_offset" : 3,
"end_offset" : 5,
"type" : "shingle",
"position" : 3,
"positionLength" : 2
},
{
"token" : "医",
"start_offset" : 4,
"end_offset" : 5,
"type" : "<IDEOGRAPHIC>",
"position" : 4
},
{
"token" : "医 院",
"start_offset" : 4,
"end_offset" : 6,
"type" : "shingle",
"position" : 4,
"positionLength" : 2
},
{
"token" : "院",
"start_offset" : 5,
"end_offset" : 6,
"type" : "<IDEOGRAPHIC>",
"position" : 5
}
]
}
POST test_index/_analyze
{
"field":"name2",
"text":"北京协和医院"
}
结果:
{
"tokens" : [
{
"token" : "北 京",
"start_offset" : 0,
"end_offset" : 2,
"type" : "shingle",
"position" : 0
},
{
"token" : "京 协",
"start_offset" : 1,
"end_offset" : 3,
"type" : "shingle",
"position" : 1
},
{
"token" : "协 和",
"start_offset" : 2,
"end_offset" : 4,
"type" : "shingle",
"position" : 2
},
{
"token" : "和 医",
"start_offset" : 3,
"end_offset" : 5,
"type" : "shingle",
"position" : 3
},
{
"token" : "医 院",
"start_offset" : 4,
"end_offset" : 6,
"type" : "shingle",
"position" : 4
}
]
}
trigrams
新建索引 setting(上文已经创建过 test_index,需要先执行 DELETE /test_index 再重新创建):
PUT /test_index
{
"settings":{
"number_of_shards":1,
"analysis":{
"analyzer":{
"trigram_analyzer_1":{
"type":"custom",
"tokenizer":"standard",
"filter":[
"lowercase",
"trigram_filter_1"
]
},
"trigram_analyzer_2":{
"type":"custom",
"tokenizer":"standard",
"filter":[
"lowercase",
"trigram_filter_2"
]
},
"trigram_analyzer_3":{
"type":"custom",
"tokenizer":"standard",
"filter":[
"lowercase",
"trigram_filter_3"
]
},
"trigram_analyzer_4":{
"type":"custom",
"tokenizer":"standard",
"filter":[
"lowercase",
"trigram_filter_4"
]
}
},
"filter":{
"trigram_filter_1":{
"max_shingle_size":"3",
"min_shingle_size":"2",
"output_unigrams":true,
"type":"shingle"
},
"trigram_filter_2":{
"max_shingle_size":"3",
"min_shingle_size":"3",
"output_unigrams":true,
"type":"shingle"
},
"trigram_filter_3":{
"max_shingle_size":"3",
"min_shingle_size":"2",
"output_unigrams":false,
"type":"shingle"
},
"trigram_filter_4":{
"max_shingle_size":"3",
"min_shingle_size":"3",
"output_unigrams":false,
"type":"shingle"
}
}
}
}
}
PUT /test_index/_mapping/_doc
{
"_doc":{
"properties":{
"name3":{
"type":"text",
"analyzer":"trigram_analyzer_1"
},
"name4":{
"type":"text",
"analyzer":"trigram_analyzer_2"
},
"name5":{
"type":"text",
"analyzer":"trigram_analyzer_3"
},
"name6":{
"type":"text",
"analyzer":"trigram_analyzer_4"
}
}
}
}
PUT test_index/_doc/1
{
"name3":"北京协和医院",
"name4":"北京协和医院",
"name5":"北京协和医院",
"name6":"北京协和医院"
}
POST test_index/_analyze
{
"field":"name3",
"text":"北京协和医院"
}
{
"tokens" : [
{
"token" : "北",
"start_offset" : 0,
"end_offset" : 1,
"type" : "<IDEOGRAPHIC>",
"position" : 0
},
{
"token" : "北 京",
"start_offset" : 0,
"end_offset" : 2,
"type" : "shingle",
"position" : 0,
"positionLength" : 2
},
{
"token" : "北 京 协",
"start_offset" : 0,
"end_offset" : 3,
"type" : "shingle",
"position" : 0,
"positionLength" : 3
},
{
"token" : "京",
"start_offset" : 1,
"end_offset" : 2,
"type" : "<IDEOGRAPHIC>",
"position" : 1
},
{
"token" : "京 协",
"start_offset" : 1,
"end_offset" : 3,
"type" : "shingle",
"position" : 1,
"positionLength" : 2
},
{
"token" : "京 协 和",
"start_offset" : 1,
"end_offset" : 4,
"type" : "shingle",
"position" : 1,
"positionLength" : 3
},
{
"token" : "协",
"start_offset" : 2,
"end_offset" : 3,
"type" : "<IDEOGRAPHIC>",
"position" : 2
},
{
"token" : "协 和",
"start_offset" : 2,
"end_offset" : 4,
"type" : "shingle",
"position" : 2,
"positionLength" : 2
},
{
"token" : "协 和 医",
"start_offset" : 2,
"end_offset" : 5,
"type" : "shingle",
"position" : 2,
"positionLength" : 3
},
{
"token" : "和",
"start_offset" : 3,
"end_offset" : 4,
"type" : "<IDEOGRAPHIC>",
"position" : 3
},
{
"token" : "和 医",
"start_offset" : 3,
"end_offset" : 5,
"type" : "shingle",
"position" : 3,
"positionLength" : 2
},
{
"token" : "和 医 院",
"start_offset" : 3,
"end_offset" : 6,
"type" : "shingle",
"position" : 3,
"positionLength" : 3
},
{
"token" : "医",
"start_offset" : 4,
"end_offset" : 5,
"type" : "<IDEOGRAPHIC>",
"position" : 4
},
{
"token" : "医 院",
"start_offset" : 4,
"end_offset" : 6,
"type" : "shingle",
"position" : 4,
"positionLength" : 2
},
{
"token" : "院",
"start_offset" : 5,
"end_offset" : 6,
"type" : "<IDEOGRAPHIC>",
"position" : 5
}
]
}
POST test_index/_analyze
{
"field":"name5",
"text":"北京协和医院"
}
{
"tokens" : [
{
"token" : "北 京",
"start_offset" : 0,
"end_offset" : 2,
"type" : "shingle",
"position" : 0
},
{
"token" : "北 京 协",
"start_offset" : 0,
"end_offset" : 3,
"type" : "shingle",
"position" : 0,
"positionLength" : 2
},
{
"token" : "京 协",
"start_offset" : 1,
"end_offset" : 3,
"type" : "shingle",
"position" : 1
},
{
"token" : "京 协 和",
"start_offset" : 1,
"end_offset" : 4,
"type" : "shingle",
"position" : 1,
"positionLength" : 2
},
{
"token" : "协 和",
"start_offset" : 2,
"end_offset" : 4,
"type" : "shingle",
"position" : 2
},
{
"token" : "协 和 医",
"start_offset" : 2,
"end_offset" : 5,
"type" : "shingle",
"position" : 2,
"positionLength" : 2
},
{
"token" : "和 医",
"start_offset" : 3,
"end_offset" : 5,
"type" : "shingle",
"position" : 3
},
{
"token" : "和 医 院",
"start_offset" : 3,
"end_offset" : 6,
"type" : "shingle",
"position" : 3,
"positionLength" : 2
},
{
"token" : "医 院",
"start_offset" : 4,
"end_offset" : 6,
"type" : "shingle",
"position" : 4
}
]
}
POST test_index/_analyze
{
"field":"name4",
"text":"北京协和医院"
}
{
"tokens" : [
{
"token" : "北",
"start_offset" : 0,
"end_offset" : 1,
"type" : "<IDEOGRAPHIC>",
"position" : 0
},
{
"token" : "北 京 协",
"start_offset" : 0,
"end_offset" : 3,
"type" : "shingle",
"position" : 0,
"positionLength" : 3
},
{
"token" : "京",
"start_offset" : 1,
"end_offset" : 2,
"type" : "<IDEOGRAPHIC>",
"position" : 1
},
{
"token" : "京 协 和",
"start_offset" : 1,
"end_offset" : 4,
"type" : "shingle",
"position" : 1,
"positionLength" : 3
},
{
"token" : "协",
"start_offset" : 2,
"end_offset" : 3,
"type" : "<IDEOGRAPHIC>",
"position" : 2
},
{
"token" : "协 和 医",
"start_offset" : 2,
"end_offset" : 5,
"type" : "shingle",
"position" : 2,
"positionLength" : 3
},
{
"token" : "和",
"start_offset" : 3,
"end_offset" : 4,
"type" : "<IDEOGRAPHIC>",
"position" : 3
},
{
"token" : "和 医 院",
"start_offset" : 3,
"end_offset" : 6,
"type" : "shingle",
"position" : 3,
"positionLength" : 3
},
{
"token" : "医",
"start_offset" : 4,
"end_offset" : 5,
"type" : "<IDEOGRAPHIC>",
"position" : 4
},
{
"token" : "院",
"start_offset" : 5,
"end_offset" : 6,
"type" : "<IDEOGRAPHIC>",
"position" : 5
}
]
}
POST test_index/_analyze
{
"field":"name6",
"text":"北京协和医院"
}
{
"tokens" : [
{
"token" : "北 京 协",
"start_offset" : 0,
"end_offset" : 3,
"type" : "shingle",
"position" : 0
},
{
"token" : "京 协 和",
"start_offset" : 1,
"end_offset" : 4,
"type" : "shingle",
"position" : 1
},
{
"token" : "协 和 医",
"start_offset" : 2,
"end_offset" : 5,
"type" : "shingle",
"position" : 2
},
{
"token" : "和 医 院",
"start_offset" : 3,
"end_offset" : 6,
"type" : "shingle",
"position" : 3
}
]
}
逗号分词器
PUT my_index
{
"settings":{
"analysis":{
"analyzer":{
"comma":{
"type":"pattern",
"pattern":","
}
}
}
}
}
POST my_index/_analyze
{
"analyzer": "comma",
"text": "20200911,20200918,20200925"
}
N-Gram&Edge-Ngram
# 若 test_index 已存在,需先执行 DELETE /test_index
PUT /test_index
{
"settings": {
"number_of_shards": 1,
"analysis": {
"analyzer": {
"lowercase_ngram": {
"tokenizer": "ngram_tokenizer",
"filter": "lowercase"
},
"lowercase_edge-ngram": {
"tokenizer": "edge-ngram_tokenizer",
"filter": "lowercase"
}
},
"tokenizer": {
"ngram_tokenizer": {
"type": "ngram",
"min_gram": "1",
"max_gram": "2",
"token_chars": [
"letter",
"digit"
]
},
"edge-ngram_tokenizer": {
"type": "edge_ngram",
"min_gram": "1",
"max_gram": "10",
"token_chars": [
"letter",
"digit"
]
}
}
}
}
}
PUT /test_index/_mapping/_doc
{
"_doc":{
"properties":{
"name1":{
"type":"text",
"analyzer":"lowercase_ngram"
},
"name2":{
"type":"text",
"analyzer":"lowercase_edge-ngram"
}
}
}
}
POST test_index/_analyze
{
"field":"name1",
"text":"北京协和医院"
}
POST test_index/_analyze
{
"field":"name2",
"text":"北京协和医院"
}
name1(ngram,min_gram=1、max_gram=2)的分词结果:北、北京、京、京协、协、协和、和、和医、医、医院、院
name2(edge_ngram,min_gram=1、max_gram=10)的分词结果:北、北京、北京协、北京协和、北京协和医、北京协和医院