使用 可以用 SQL 进行 elasticsearch 的查询。下钻未必一定是GROUP BY,每记过一层GROUP BY,桶的数量就会增加一层。还有一种下钻是用 filter 来下钻,这种下钻方式会使下一层的 match的文档数量变少,但是桶的数量不变。
SELECT INSIDE WHERE ipo_year=2000
SQL
$ cat << EOF | ./es_query.py http://127.0.0.1:9200 WITH all_symbols AS (SELECT MAX(market_cap) AS max_all_times FROM symbol); WITH year_2000 AS (SELECT MAX(market_cap) AS max_at_2000 FROM all_symbols WHERE ipo_year=2000)EOF
{"max_at_2000": 20310000000.0, "max_all_times": 522690000000.0}
可以看到桶的数量还是同一个,只是增加了一个max_at_2000结果字段。
Elasticsearch{ "aggs": { "year_2000": { "filter": { "term": { "ipo_year": 2000 } }, "aggs": { "max_at_2000": { "max": { "field": "market_cap" } } } }, "max_all_times": { "max": { "field": "market_cap" } } }, "size": 0}
{ "hits": { "hits": [], "total": 6714, "max_score": 0.0 }, "_shards": { "successful": 1, "failed": 0, "total": 1 }, "took": 3, "aggregations": { "year_2000": { "max_at_2000": { "value": 20310000000.0 }, "doc_count": 58 }, "max_all_times": { "value": 522690000000.0 } }, "timed_out": false}
Profile
[ { "query": [ { "query_type": "TermQuery", "lucene": "ipo_year:`P", "time": "0.3492430000ms", "breakdown": { "score": 0, "create_weight": 220149, "next_doc": 0, "match": 0, "build_scorer": 95037, "advance": 34057 } }, { "query_type": "MatchAllDocsQuery", "lucene": "*:*", "time": "0.1883710000ms", "breakdown": { "score": 0, "create_weight": 3980, "next_doc": 169730, "match": 0, "build_scorer": 14661, "advance": 0 } } ], "rewrite_time": 3158, "collector": [ { "name": "MultiCollector", "reason": "search_multi", "time": "8.789594000ms", "children": [ { "name": "TotalHitCountCollector", "reason": "search_count", "time": "0.2768050000ms" }, { "name": "BucketCollector: [[year_2000, max_all_times]]", "reason": "aggregation", "time": "7.667765000ms" } ] } ] }]
从Profile的结果来看,其实是一次性做了两个查询,一个是全部文档,一个是ipo_year=2000的文档,然后统一聚合。
SELECT INSIDE 往不同方向过滤下钻
SQL
$ cat << EOF | ./es_query.py http://127.0.0.1:9200 WITH all_symbols AS (SELECT MAX(market_cap) AS max_all_times FROM symbol); WITH year_2000 AS (SELECT MAX(market_cap) AS max_at_2000 FROM all_symbols WHERE ipo_year=2000); WITH year_2001 AS (SELECT MAX(market_cap) AS max_at_2001 FROM all_symbols WHERE ipo_year=2001)EOF
这个写法其实和 CASE WHEN 很类似,但是其表达能力更强大,更灵活。
$ cat << EOF | ./es_query.py http://127.0.0.1:9200 SELECT per_ipo_year, MAX(market_cap) AS max_all_times FROM symbol GROUP BY CASE WHEN ipo_year=2000 THEN 'year_2000' WHEN ipo_year=2001 THEN 'year_2001' END AS per_ipo_yearEOF
上面的sql查询结果是
{"max_at_2000": 20310000000.0, "max_all_times": 522690000000.0, "max_at_2001": 8762940000.0}
Elasticsearch
{ "aggs": { "year_2001": { "filter": { "term": { "ipo_year": 2001 } }, "aggs": { "max_at_2001": { "max": { "field": "market_cap" } } } }, "year_2000": { "filter": { "term": { "ipo_year": 2000 } }, "aggs": { "max_at_2000": { "max": { "field": "market_cap" } } } }, "max_all_times": { "max": { "field": "market_cap" } } }, "size": 0}
{ "hits": { "hits": [], "total": 6714, "max_score": 0.0 }, "_shards": { "successful": 1, "failed": 0, "total": 1 }, "took": 2, "aggregations": { "year_2001": { "max_at_2001": { "value": 8762940000.0 }, "doc_count": 38 }, "year_2000": { "max_at_2000": { "value": 20310000000.0 }, "doc_count": 58 }, "max_all_times": { "value": 522690000000.0 } }, "timed_out": false}
Profile
[ { "query": [ { "query_type": "TermQuery", "lucene": "ipo_year:`Q", "time": "0.2518270000ms", "breakdown": { "score": 0, "create_weight": 186032, "next_doc": 0, "match": 0, "build_scorer": 48664, "advance": 17131 } }, { "query_type": "TermQuery", "lucene": "ipo_year:`P", "time": "0.1200760000ms", "breakdown": { "score": 0, "create_weight": 77254, "next_doc": 0, "match": 0, "build_scorer": 25184, "advance": 17638 } }, { "query_type": "MatchAllDocsQuery", "lucene": "*:*", "time": "0.1968800000ms", "breakdown": { "score": 0, "create_weight": 3573, "next_doc": 180136, "match": 0, "build_scorer": 13171, "advance": 0 } } ], "rewrite_time": 4250, "collector": [ { "name": "MultiCollector", "reason": "search_multi", "time": "2.459413000ms", "children": [ { "name": "TotalHitCountCollector", "reason": "search_count", "time": "0.2160950000ms" }, { "name": "BucketCollector: [[year_2001, year_2000, max_all_times]]", "reason": "aggregation", "time": "1.455703000ms" } ] } ] }]
SELECT INSIDE 往同一方向连续过滤下钻
SQL
$ cat << EOF | ./es_query.py http://127.0.0.1:9200 WITH SELECT MAX(market_cap) AS max_all_times FROM symbol AS all_symbols; WITH SELECT MAX(market_cap) AS max_at_2000 FROM all_symbols WHERE ipo_year=2000 AS year_2000; WITH SELECT MAX(market_cap) AS max_at_2001_finance FROM year_2000 WHERE sector='Finance' AS year_2000_financeEOF
{"max_at_2000": 20310000000.0, "max_all_times": 522690000000.0, "max_at_2001_finance": 985668354.0}
Elasticsearch
{ "aggs": { "year_2000": { "filter": { "term": { "ipo_year": 2000 } }, "aggs": { "max_at_2000": { "max": { "field": "market_cap" } }, "year_2000_finance": { "filter": { "term": { "sector": "Finance" } }, "aggs": { "max_at_2001_finance": { "max": { "field": "market_cap" } } } } } }, "max_all_times": { "max": { "field": "market_cap" } } }, "size": 0}
{ "hits": { "hits": [], "total": 6714, "max_score": 0.0 }, "_shards": { "successful": 1, "failed": 0, "total": 1 }, "took": 2, "aggregations": { "year_2000": { "max_at_2000": { "value": 20310000000.0 }, "year_2000_finance": { "max_at_2001_finance": { "value": 985668354.0 }, "doc_count": 2 }, "doc_count": 58 }, "max_all_times": { "value": 522690000000.0 } }, "timed_out": false}
Profile
[ { "query": [ { "query_type": "TermQuery", "lucene": "ipo_year:`P", "time": "0.1897790000ms", "breakdown": { "score": 0, "create_weight": 145762, "next_doc": 0, "match": 0, "build_scorer": 26216, "advance": 17801 } }, { "query_type": "TermQuery", "lucene": "sector:Finance", "time": "0.2380290000ms", "breakdown": { "score": 0, "create_weight": 57770, "next_doc": 0, "match": 0, "build_scorer": 55497, "advance": 124762 } }, { "query_type": "MatchAllDocsQuery", "lucene": "*:*", "time": "0.1965630000ms", "breakdown": { "score": 0, "create_weight": 3500, "next_doc": 178347, "match": 0, "build_scorer": 14716, "advance": 0 } } ], "rewrite_time": 4190, "collector": [ { "name": "MultiCollector", "reason": "search_multi", "time": "2.466917000ms", "children": [ { "name": "TotalHitCountCollector", "reason": "search_count", "time": "0.2712430000ms" }, { "name": "BucketCollector: [[year_2000, max_all_times]]", "reason": "aggregation", "time": "1.370663000ms" } ] } ] }]
有了 GROUP BY 下钻和 FILTER 下钻,很多复杂的查询可以一条就查询出来。而这种边下钻边聚合指标的查询能力甚至是传统SQL都不具备的。而且稍微训练一下,就会非常习惯这种下钻的思维方式,写查询也会很自然。