Faceted navigation for e-commerce with Elasticsearch

Mike Palei
6 min read · Jun 26, 2020

--

My colleague and I (the credit has to go to Dimitry Apter, who did most of the actual, tangible work) were recently given a relatively simple task: to create multi-filter navigation for an e-commerce site. Since this feature is by no means unique, I was certain that it was going to be a five-minute Google search and Bob’s your uncle. I was bitterly disappointed. I read about filters aggregations, terms aggregations, nested aggregations, composite aggregations, and on and on. It took almost a day to find the answer I really needed. Hopefully, people working on similar tasks will now quickly stumble upon this post and find it useful.

The scenario is quite trivial. Suppose we have a shop selling clothes and our clothes have but 5 attributes: category, color, brand, style and size, i.e. 5 facets in Elasticsearch terms.

To make this post comprehensive let’s first synthesise some toy data.

import pandas as pd
import numpy as np
# Synthesise a 50-row toy catalogue. Every product first gets a Category
# (roughly 70% dresses, 30% pants); Style and Brand are then drawn from the
# lists belonging to that category, while Color and Size are drawn from lists
# shared by all products.
df = pd.DataFrame({
    'Category': np.random.choice(['Dress', 'Pants'], size=50, p=[0.7, 0.3]),
    'Color': None,
    'Style': None,
    'Brand': None,
    'Size': None,
})

dress_styles = ['Maxi', 'Evening', 'Shift', 'Sheath']
dress_brands = ['Hermes', 'Prada', 'Chanel', 'Fendi', 'Armani']
sizes = ['S', 'M', 'L', 'XL']
pants_styles = ['Culottes', 'Tights', 'Dungarees']
pants_brands = ["Levi's", "Wrangler", "Armani", "Calvin Klein", "Diesel"]
colors = ['Green', 'Black', 'White', 'Red', 'Blue']

# Dresses only ever receive dress styles and dress brands.
is_dress = df['Category'] == 'Dress'
size = int(is_dress.sum())
df.loc[is_dress, 'Style'] = np.random.choice(dress_styles, size=size)
df.loc[is_dress, 'Brand'] = np.random.choice(dress_brands, size=size)

# Pants only ever receive pants styles and pants brands.
is_pants = df['Category'] == 'Pants'
size = int(is_pants.sum())
df.loc[is_pants, 'Style'] = np.random.choice(pants_styles, size=size)
df.loc[is_pants, 'Brand'] = np.random.choice(pants_brands, size=size)

# Color and Size do not depend on the category.
df['Color'] = np.random.choice(colors, size=len(df))
df['Size'] = np.random.choice(sizes, size=len(df))

# Sequential 1-based identifier for each row.
df['id'] = list(range(1, len(df) + 1))

Now let’s insert our data into an Elasticsearch index (I was using ES 7.7):

from elasticsearch import Elasticsearch, helpers
def filterKeys(document, df):
    """Return the entries of ``document`` whose keys are columns of ``df``.

    Args:
        document: Mapping-like object indexable by column name (for example
            a row produced by ``df.iterrows()``).
        df: DataFrame whose column labels select which keys to keep.

    Returns:
        A plain ``dict`` with one entry per column of ``df``.

    Raises:
        KeyError: If ``document`` has no entry for one of the columns.
    """
    return {key: document[key] for key in df.columns.values}
def doc_generator(df, index_name):
    """Yield one bulk-indexing action per row of ``df``.

    Args:
        df: DataFrame to index; it must contain an ``id`` column, which is
            used (as a string) for the document ``_id``.
        index_name: Name of the target index, stored in every action's
            ``_index`` field.

    Yields:
        A dict with ``_index``, ``_id`` and ``_source`` keys, where
        ``_source`` holds the row's value for every column of ``df``.
    """
    # The row label returned by iterrows() is not needed, only the row itself.
    for _, document in df.iterrows():
        yield {
            "_index": index_name,
            "_id": f"{document['id']}",
            "_source": filterKeys(document, df),
        }


es_client = Elasticsearch('localhost:9200')
# Name of the index that will hold the toy catalogue.
index_name = 'faceted_navigation'

# Explicit mapping: "id" is declared as an integer and every facet column is
# declared as a keyword field.
keyword_facets = ("Category", "Color", "Brand", "Style", "Size")
index_properties = {"id": {"type": "integer"}}
for facet in keyword_facets:
    index_properties[facet] = {"type": "keyword"}

es_client.indices.create(
    index_name,
    body={"mappings": {"properties": index_properties}},
)

# Send every row of the DataFrame to the index through the bulk helper.
helpers.bulk(es_client, doc_generator(df, index_name), request_timeout=120)

Suppose I was searching for a dress. My query would look something like:

{
"size": 0,
"query": {
"match": {
"Category": "Dress"
}
},
"aggs": {
"Color": {
"terms": {
"field": "Color",
"size": 10
}
},
"Size": {
"terms": {
"field": "Size",
"size": 10
}
},
"Brand": {
"terms": {
"field": "Brand",
"size": 10
}
},
"Style": {
"terms": {
"field": "Style",
"size": 10
}
}
}
}

And the response would look something like:

{
"took": 33,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"skipped": 0,
"failed": 0
},
"hits": {
"total": {
"value": 30,
"relation": "eq"
},
"max_score": null,
"hits": []
},
"aggregations": {
"Brand": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "Prada",
"doc_count": 8
},
{
"key": "Chanel",
"doc_count": 6
},
{
"key": "Hermes",
"doc_count": 6
},
{
"key": "Armani",
"doc_count": 5
},
{
"key": "Fendi",
"doc_count": 5
}
]
},
"Size": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "L",
"doc_count": 9
},
{
"key": "XL",
"doc_count": 9
},
{
"key": "M",
"doc_count": 8
},
{
"key": "S",
"doc_count": 4
}
]
},
"Color": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "Black",
"doc_count": 8
},
{
"key": "Red",
"doc_count": 8
},
{
"key": "White",
"doc_count": 7
},
{
"key": "Blue",
"doc_count": 4
},
{
"key": "Green",
"doc_count": 3
}
]
},
"Style": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "Maxi",
"doc_count": 11
},
{
"key": "Sheath",
"doc_count": 11
},
{
"key": "Evening",
"doc_count": 5
},
{
"key": "Shift",
"doc_count": 3
}
]
}
}
}

Now suppose I applied some filters. I selected Prada evening dresses, in other words Brand — “Prada”, Style — “Evening”. The desired behaviour is:

  • Search results only contain Prada evening dresses
  • The counts for Brand are only affected by the Style filter
  • The counts for Style are only affected by the Brand filter
  • The counts for all other attributes are recalculated based on both Style and Brand filters

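As it turned out, what I was looking for was not some advanced aggregation but the post_filter functionality combined with plain filter aggregations. The post_filter narrows the search hits without affecting the aggregations, while each facet’s terms aggregation is wrapped in a filter aggregation that applies every selected filter except the facet’s own. Thus, the query would look like: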

{
"query": {
"match": {
"Category": "Dress"
}
},
"aggs": {
"Color": {
"aggs": {
"Color": {
"terms": {
"field": "Color",
"size": 10
}
}
},
"filter": {
"bool": {
"filter": [
{
"terms": {
"Brand": [
"Prada"
]
}
},
{
"terms": {
"Style": [
"Evening"
]
}
}
]
}
}
},
"Size": {
"aggs": {
"Size": {
"terms": {
"field": "Size",
"size": 5
}
}
},
"filter": {
"bool": {
"filter": [
{
"terms": {
"Brand": [
"Prada"
]
}
},
{
"terms": {
"Style": [
"Evening"
]
}
}
]
}
}
},
"Brand": {
"aggs": {
"Brand": {
"terms": {
"field": "Brand",
"size": 10
}
}
},
"filter": {
"bool": {
"filter": [
{
"terms": {
"Style": [
"Evening"
]
}
}
]
}
}
},
"Style": {
"aggs": {
"Style": {
"terms": {
"field": "Style",
"size": 10
}
}
},
"filter": {
"bool": {
"filter": [
{
"terms": {
"Brand": [
"Prada"
]
}
}
]
}
}
}
},
"post_filter": {
"bool": {
"filter": [
{
"terms": {
"Brand": [
"Prada"
]
}
},
{
"terms": {
"Style": [
"Evening"
]
}
}
]
}
}
}

Let’s quickly run some tests to understand what we are expecting our results to be. First, let’s check how many total dresses our dataset contains:

curl --location --request GET 'localhost:9200/faceted_navigation/_count' \
--header 'Content-Type: application/json' \
--data-raw '{
"query": {
"bool": {
"must": [
{
"match": {
"Category": "Dress"
}
}
]
}
}
}'

This returns the following (this holds only for the data I generated; because the data is random, your results may differ if you repeat my experiment):

{
"count": 30,
"_shards": {
"total": 1,
"successful": 1,
"skipped": 0,
"failed": 0
}
}

In other words, we expect the values in the Brand and the Style buckets to sum up to 30.

Now let’s check how many Prada evening dresses our dataset has:

curl --location --request GET 'localhost:9200/faceted_navigation/_count' \
--header 'Content-Type: application/json' \
--data-raw '{
"query": {
"bool": {
"must": [
{
"match": {
"Category": "Dress"
}
},
{
"match": {
"Brand": "Prada"
}
},
{
"match": {
"Style": "Evening"
}
}
]
}
}
}'

This returns:

{
"count": 2,
"_shards": {
"total": 1,
"successful": 1,
"skipped": 0,
"failed": 0
}
}

Great, we only have two matches, so it will be easy to test our results.

And here are the final results of our faceted query:

{
"took": 4,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"skipped": 0,
"failed": 0
},
"hits": {
"total": {
"value": 2,
"relation": "eq"
},
"max_score": 0.51409894,
"hits": [
{
"_index": "faceted_search",
"_type": "_doc",
"_id": "16",
"_score": 0.51409894,
"_source": {
"Category": "Dress",
"Color": "Blue",
"Style": "Evening",
"Brand": "Prada",
"Size": "L",
"id": 16
}
},
{
"_index": "faceted_search",
"_type": "_doc",
"_id": "36",
"_score": 0.51409894,
"_source": {
"Category": "Dress",
"Color": "Green",
"Style": "Evening",
"Brand": "Prada",
"Size": "XL",
"id": 36
}
}
]
},
"aggregations": {
"Brand": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "Prada",
"doc_count": 8
},
{
"key": "Chanel",
"doc_count": 6
},
{
"key": "Hermes",
"doc_count": 6
},
{
"key": "Armani",
"doc_count": 5
},
{
"key": "Fendi",
"doc_count": 5
}
]
},
"Size": {
"doc_count": 2,
"Size": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "L",
"doc_count": 1
},
{
"key": "XL",
"doc_count": 1
}
]
}
},
"Color": {
"doc_count": 2,
"Color": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "Blue",
"doc_count": 1
},
{
"key": "Green",
"doc_count": 1
}
]
}
},
"Style": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "Maxi",
"doc_count": 11
},
{
"key": "Sheath",
"doc_count": 11
},
{
"key": "Evening",
"doc_count": 5
},
{
"key": "Shift",
"doc_count": 3
}
]
}
}
}

We do indeed have only two hits and they are both Prada evening dresses. The values in the Brand bucket are: Prada — 8, Chanel — 6, Hermes — 6, Armani — 5, Fendi — 5. The values in the Style bucket are: Maxi — 11, Sheath — 11, Evening — 5, Shift — 3. In both buckets the total is 30, so we are good. In the other two buckets the values, as expected, sum up to 2, which means that our filters were applied.

Mission accomplished.

--

--