我有一个索引,其中有很多文档在同一个字段上具有相同的值。我想按这个字段对搜索结果进行去重。
聚合(aggregation)返回给我的只是计数,而我想要的是一份文档列表。
我的索引:
我想要得到这样的结果(按 domain 字段去重后的结果):
您可以使用字段折叠(field collapsing):用 terms 聚合按 `name` 字段对结果分组,并将 `top_hits` 聚合器的 `size` 设置为 1,这样每个分组只返回一篇文档。
name
top_hits
/POST http://localhost:9200/test/dedup/_search?search_type=count&pretty=true { "aggs":{ "dedup" : { "terms":{ "field": "name" }, "aggs":{ "dedup_docs":{ "top_hits":{ "size":1 } } } } } }
这将返回:
{ "took" : 192, "timed_out" : false, "_shards" : { "total" : 1, "successful" : 1, "failed" : 0 }, "hits" : { "total" : 6, "max_score" : 0.0, "hits" : [ ] }, "aggregations" : { "dedup" : { "buckets" : [ { "key" : "name1", "doc_count" : 2, "dedup_docs" : { "hits" : { "total" : 2, "max_score" : 1.0, "hits" : [ { "_index" : "test", "_type" : "dedup", "_id" : "1", "_score" : 1.0, "_source":{domain: "domain1.fr", name: "name1", date: "01-01-2014"} } ] } } }, { "key" : "name2", "doc_count" : 2, "dedup_docs" : { "hits" : { "total" : 2, "max_score" : 1.0, "hits" : [ { "_index" : "test", "_type" : "dedup", "_id" : "3", "_score" : 1.0, "_source":{domain: "domain1.fr", name: "name2", date: "01-03-2014"} } ] } } }, { "key" : "name3", "doc_count" : 2, "dedup_docs" : { "hits" : { "total" : 2, "max_score" : 1.0, "hits" : [ { "_index" : "test", "_type" : "dedup", "_id" : "5", "_score" : 1.0, "_source":{domain: "domain1.fr", name: "name3", date: "01-05-2014"} } ] } } } ] } } }