Thursday, March 16, 2017

Elasticsearch: find word parts in a nested object


I'm not able to find parts of a word in a nested object. Only the full word is found. My analyzer configuration is as follows:

{   "settings": {     "number_of_shards": 1,     "analysis": {       "filter": {         "word_part_filter": {           "type": "ngram",           "min_gram": 3,           "max_gram": 15         },         "word_part_front_filter": {           "type": "edgeNGram",           "min_gram": 2,           "max_gram": 15         },         "codeid_filter": {           "type": "pattern_replace",           "pattern": "[-/.:]",           "replacement": "",           "preserve_original": true         }       },       "char_filter": {         "umlaut_char_filter": {           "type": "mapping",           "mappings": [             "ö=>oe",             "ä=>ae",             "ü=>ue",             "�=>ss",             "�=>Oe",             "�=>Ae",             "�=>Ue"           ]         }       },       "analyzer": {         "description_analyser_query": {           "type": "custom",           "char_filter": [             "html_strip"           ],           "tokenizer": "standard",           "filter": [             "lowercase",             "stop",             "asciifolding"           ]         },         "description_analyser_idx": {           "type": "custom",           "char_filter": [             "html_strip"           ],           "tokenizer": "standard",           "filter": [             "lowercase",             "stop",             "asciifolding",             "word_part_filter"           ]         },         "name_analyser_query": {           "type": "custom",           "char_filter": [             "umlaut_char_filter"           ],           "tokenizer": "standard",           "filter": [             "lowercase",             "asciifolding"           ]         },         "name_analyser_idx": {           "type": "custom",           "char_filter": [             "umlaut_char_filter"           ],           "tokenizer": "standard",           "filter": [             "lowercase",             "asciifolding",             "word_part_filter"           ]         },         "codeid_analyser_query": {           "type": "custom",           "tokenizer": "keyword",           "filter": [             "lowercase",             "codeid_filter"           ]         },         "codeid_analyser_idx_front": {           "type": "custom",           "tokenizer": "keyword",           "filter": [             "lowercase",             "codeid_filter",             "word_part_front_filter"           ]         },         "codeid_analyser_idx_any": {           "type": "custom",           "tokenizer": "keyword",           "filter": [             "lowercase",             "codeid_filter",             "word_part_filter"           ]         }       }     }   } } 

This is the nested object mapping (extracted):

{   "properties": {         "aid": {       "type": "nested",       "properties": {         "tpid": {           "type": "string",           "analyzer": "codeid_analyser_idx_any"         },         "aid": {           "type": "string",           "analyzer": "codeid_analyser_idx_any"         }       }         }   } } 

I'm searching with this query (extract). Only the "nested" part is essential here:

{   "query": {     "bool": {       "must": [         {           "bool": {             "should": [               {                 "nested": {                   "path": "aid",                   "query": {                     "bool": {                       "must": {                         "match": {                           "aid.aid": {                             "query": "1200",                             "analyzer": "codeid_analyser_query"                           }                         }                       },                       "filter": {                         "or": [                           {                             "match": {                               "aid.tpid": "buyer_specific"                             }                           },                           {                             "match": {                               "aid.tpid": "mytpid"                             }                           }                         ]                       }                     }                   }                 }               }             ],             "minimum_should_match": 1           }         }       ]     }   } } 

There is an element with aid=120000008.

With the analyzers configured on the fields, nothing is found. With no analyzers at all in the nested object mapping and query, only the full value (like "120000008") is found, but not "1200". Any ideas?
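For comparison, running the query-time analyzer over the search term (again with a placeholder index name) should show that "1200" is kept as a single lowercased token, which ought to match one of the ngrams produced at index time:

GET my_index/_analyze
{
  "analyzer": "codeid_analyser_query",
  "text": "1200"
}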

1 Answer

Answer 1

Actually, using Elasticsearch 5.2, with an index called test and the mapping applied to a type called "product" (rewriting only the filter part so that it complies with the current query DSL), I obtain the correct result. The query:

GET test/_search {   "query": {     "bool": {       "must": [         {           "bool": {             "should": [               {                 "nested": {                   "path": "aid",                   "query": {                     "bool": {                       "must": {                         "match": {                           "aid.aid": {                             "query": "1200",                             "analyzer": "codeid_analyser_query"                           }                         }                       },                       "filter": {                         "terms": {                           "aid.tpid": [                             "mytpid",                             "buyer_specific"                           ]                         }                       }                     }                   }                 }               }             ],             "minimum_should_match": 1           }         }       ]     }   } } 

The index content (shown via a match-all search):

GET test/_search  {   "took": 8,   "timed_out": false,   "_shards": {     "total": 5,     "successful": 5,     "failed": 0   },   "hits": {     "total": 1,     "max_score": 1,     "hits": [       {         "_index": "test",         "_type": "product",         "_id": "AVrJ1CSd-NyeQ4r64kP6",         "_score": 1,         "_source": {           "aid": {             "aid": "120000008",             "tpid": "mytpid"           }         }       }     ]   } } 

The analyzer settings (I removed the umlaut char filter because it displayed as garbled characters on my machine; this does not change the result, since it is not used in these tests):

PUT test {   "settings": {      "analysis": {       "filter": {         "word_part_filter": {           "type": "ngram",           "min_gram": 3,           "max_gram": 15         },         "word_part_front_filter": {           "type": "edgeNGram",           "min_gram": 2,           "max_gram": 15         },         "codeid_filter": {           "type": "pattern_replace",           "pattern": "[-/.:]",           "replacement": "",           "preserve_original": true         }       },        "analyzer": {         "description_analyser_query": {           "type": "custom",           "char_filter": [             "html_strip"           ],           "tokenizer": "standard",           "filter": [             "lowercase",             "stop",             "asciifolding"           ]         },         "description_analyser_idx": {           "type": "custom",           "char_filter": [             "html_strip"           ],           "tokenizer": "standard",           "filter": [             "lowercase",             "stop",             "asciifolding",             "word_part_filter"           ]         },          "codeid_analyser_query": {           "type": "custom",           "tokenizer": "keyword",           "filter": [             "lowercase",             "codeid_filter"           ]         },         "codeid_analyser_idx_front": {           "type": "custom",           "tokenizer": "keyword",           "filter": [             "lowercase",             "codeid_filter",             "word_part_front_filter"           ]         },         "codeid_analyser_idx_any": {           "type": "custom",           "tokenizer": "keyword",           "filter": [             "lowercase",             "codeid_filter",             "word_part_filter"           ]         }       }      }   } } 

The mapping on product:

PUT test/_mapping/product
{
  "properties": {
    "aid": {
      "type": "nested",
      "properties": {
        "tpid": {
          "type": "string",
          "analyzer": "codeid_analyser_idx_any"
        },
        "aid": {
          "type": "string",
          "analyzer": "codeid_analyser_idx_any"
        }
      }
    }
  }
}
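The indexing step is not shown above; the document behind the hit in the search result can be indexed with something like this (the _id in the result was presumably auto-generated):

POST test/product
{
  "aid": {
    "aid": "120000008",
    "tpid": "mytpid"
  }
}

After that, the query above returns the document even though the search term "1200" is only a fragment of the stored value, because codeid_analyser_idx_any has indexed every 3- to 15-character ngram of "120000008".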