From 59db95897796e153b4a44b8d7c689c45b4824939 Mon Sep 17 00:00:00 2001
From: Anna-Maria Meriniemi
Date: Sun, 29 Apr 2018 04:09:40 +0300
Subject: [PATCH] ES: delimit words before ngram, optimize tokens (#487)

Before, long.tokens.with.dots.or.dashes would get edge-ngrammed up to
the ngram limit, leaving long.tokens.wit, which would then be split,
discarding "with.dots.or.dashes" completely. The fullword index would
keep the complete large token, but without any ngramming, so incomplete
searches (like "tokens") would not match it, only the full token.

Now, we split words before ngramming them, so the main index will
properly handle words up to the ngram limit. The fullword index will
still handle the longer words for non-ngram matching.

Also optimized away duplicate tokens from the indices (since we rely on
boolean matching, not scoring) to save a couple of megabytes of space.
---
 es_mapping.yml | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/es_mapping.yml b/es_mapping.yml
index e064ce1..1086fb2 100644
--- a/es_mapping.yml
+++ b/es_mapping.yml
@@ -20,9 +20,10 @@ settings:
         filter:
           - resolution
           - lowercase
-          - my_ngram
           - word_delimit
+          - my_ngram
           - trim_zero
+          - unique
       # For exact matching - simple lowercase + whitespace delimiter
       exact_analyzer:
         tokenizer: whitespace
@@ -40,6 +41,7 @@ settings:
           # Skip tokens shorter than N characters,
           # since they're already indexed in the main field
           - fullword_min
+          - unique
 
     filter:
       my_ngram:
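
A quick way to see the effect of the reordered filter chain is to feed a dotted token to the _analyze API of a running node. The sketch below is illustrative only and not part of the patch: the URL, the index name ("nyaa"), and the analyzer name ("my_index_analyzer") are assumptions that do not appear in the diff, so substitute whatever names es_mapping.yml is actually deployed under.

import json
import requests

ES_URL = "http://localhost:9200"    # assumed local Elasticsearch node
INDEX = "nyaa"                      # hypothetical index name
ANALYZER = "my_index_analyzer"      # hypothetical analyzer name from es_mapping.yml

resp = requests.post(
    f"{ES_URL}/{INDEX}/_analyze",
    headers={"Content-Type": "application/json"},
    data=json.dumps({
        "analyzer": ANALYZER,
        "text": "long.tokens.with.dots.or.dashes",
    }),
)
resp.raise_for_status()

# With word_delimit running before my_ngram, each dot-separated word is
# edge-ngrammed on its own (so a partial search like "tokens" can match),
# and the unique filter drops duplicate ngrams before they reach the index.
for token in resp.json()["tokens"]:
    print(token["token"])

The unique filter is safe here only because matching is boolean rather than scored; with relevance scoring, collapsing duplicate tokens would alter term frequencies.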