当前位置：首页 > article >正文

51.MongoDB聚合操作与索引使用详解

article 2025/1/31 13:55:48

聚合操作

聚合操作允许用户处理多个文档并返回计算结果。

从效果而言，聚合框架相当于 SQL 查询中的GROUP BY、 LEFT OUTER JOIN 、 AS等。

聚合管道

整个聚合运算过程称为管道（Pipeline），由多个阶段（Stage）组成

原始数据集 -----------Stage---------》中间结果 ------------Stage--------》中间结果 --------------------》结果集

常用的聚合阶段运算符

$match 筛选条件
$project 投影
$lookup 左外连接
$sort 排序
$group 分组
- $group 阶段的内存限制为 100MB。默认情况下，如果该阶段超过此限制，$group 将产生错误。但是，要允许处理大型数据集，请将 allowDiskUse 选项设置为 true，以允许 $group 操作写入临时文件。
$skip / $limit 分页
$unwind 展开数组
$graphLookup 图搜索
- 主要用来实现多表关联查询，相当于关系型数据库中的多表关联查询。
$facet / $bucket 分面搜索

#投影操作，将原始字段投影成指定名称，可以灵活控制输出文档的格式
db.books.aggregate([{$project:{name:"$title",_id:0,type:1,author:1}}])

#$match用于对文档进行筛选,尽可能将$match放在管道的前面位置,可以使用索引,过滤掉不需要的文档
db.books.aggregate([{$match:{type:"technology"}}])
#在包含式投影中混用排除字段（例如写成type:0）会导致错误：Invalid $project :: caused by :: Cannot do exclusion on field type in inclusion projection（_id:0 除外）
#$match和$project结合
db.books.aggregate([
    {$match:{type:"technology"}},
    {$project:{name:"$title",_id:0,type:1,author:{name:1}}}
])

#$count
db.books.aggregate([
    {$match:{type:"technology"}},
    {$count: "type_count"}
])

#$group 
#accumulator操作符 $avg $first $last $max $min $push $addToSet $sum $stdDevPop $stdDevSamp
#book的数量，收藏总数和平均值
db.books.aggregate([
    {$group:{_id:null,count:{$sum:1},pop:{$sum:"$favCount"},avg:{$avg:"$favCount"}}}
])
#统计每个作者的book收藏总数
db.books.aggregate([
    {$group:{_id:"$author.name",pop:{$sum:"$favCount"}}}
])
#统计每个作者的每本book的收藏数
db.books.aggregate([
    {$group:{_id:{name:"$author.name",title:"$title"},pop:{$sum:"$favCount"}}}
])
#每个作者的book的type合集
db.books.aggregate([
    {$group:{_id:"$author.name",types:{$addToSet:"$type"}}}
])
#姓名为xx006的作者的book的tag数组拆分为多个文档
db.books.aggregate([
    {$match:{"author.name":"xx006"}},
    {$unwind:"$tag"}
])

#每个作者的book的tag合集
db.books.aggregate([
    {$unwind:"$tag"},
    {$group:{_id:"$author.name",types:{$addToSet:"$tag"}}}
])
# 使用includeArrayIndex选项来输出数组元素的数组索引
db.books.aggregate([
    {$match:{"author.name":"fox"}},
    {$unwind:{path:"$tag", includeArrayIndex: "arrayIndex"}}
])
# 使用preserveNullAndEmptyArrays选项，在输出中保留缺少tag字段、tag为null或为空数组的文档，防止漏掉文档
db.books.aggregate([
    {$match:{"author.name":"fox"}},
    {$unwind:{path:"$tag", preserveNullAndEmptyArrays: true}}
])

#$limit 限制传递到管道中下一阶段的文档数
db.books.aggregate([
    {$limit : 5 }
])

#$skip 跳过进入stage的指定数量的文档，并将其余文档传递到管道中的下一个阶段
db.books.aggregate([
     {$skip : 50 }
])

#$sort 对所有输入文档进行排序，并按排序顺序将它们返回到管道。
db.books.aggregate([
    // Pipeline stages are array elements and must be separated by commas.
    // $skip runs before $sort here, so only the documents left after the skip are sorted.
    {$skip : 50 },
    {$sort : {favCount:-1,"author.age":1}}
])

#$lookup 每个输入待处理的文档，经过$lookup 阶段的处理，输出的新文档中会包含一个新生成的数组

db.customer.aggregate([        
    {$lookup: {
       from: "order",
       localField: "customerCode",
       foreignField: "customerCode",
       as: "customerOrder"
     }
    } 
])
// Enrich each order with its customer and its order items using two $lookup stages.
db.order.aggregate([
    {$lookup: {
        from: "customer",
        localField: "customerCode",
        foreignField: "customerCode",
        as: "customer"        // array of customer documents whose customerCode matches
    }},
    {$lookup: {
        from: "orderItem",
        localField: "orderId",
        foreignField: "orderId",
        as: "orderItem"       // array of orderItem documents whose orderId matches
    }}
])


#标签的热度排行，标签的热度则按其关联book文档的收藏数（favCount）来计算
db.books.aggregate([
    {$match:{favCount:{$gt:0}}},
    {$unwind:"$tag"},
    {$group:{_id:"$tag",total:{$sum:"$favCount"}}},
    {$sort:{total:-1}}
])


#$bucket
db.books.aggregate([{
    $bucket:{
        groupBy:"$favCount",
        boundaries:[0,10,60,80,100],
        default:"other",
        output:{"count":{$sum:1}}
    }
}])


#使用mongoimport工具导入数据
mongoimport -h 192.168.139.150 -d test -u lywtimer -p 0918 --authenticationDatabase=admin -c zips --file D:\mongodb-database-tools-windows-x86_64-100.9.0\zips.json  

#返回人口超过1000万的州
db.zips.aggregate( [
   { $group: { _id: "$state", totalPop: { $sum: "$pop" } } },
   { $match: { totalPop: { $gte: 10*1000*1000 } } }
] )

聚合优化

尽可能利用索引完成搜索和排序
尽早尽多减少数据量
尽可能减少执行步骤

MongoDB索引详解

MongoDB采用B+Tree 做索引，索引创建在集合（collection）上。

索引类型

单键索引
复合索引
多键(数组)索引
Hash索引
地理空间索引
全文索引
通配符索引
- MongoDB的文档模式是动态变化的，而通配符索引可以建立在一些不可预知的字段上，以此实现查询的加速。MongoDB 4.2 引入了通配符索引来支持对未知或任意字段的查询。

索引属性

唯一索引
部分索引
稀疏索引
TTL索引
隐藏索引
- 通过对规划器隐藏索引，用户可以在不实际删除索引的情况下评估删除索引的潜在影响。如果影响是负面的，用户可以取消隐藏索引，而不必重新创建已删除的索引。

索引使用建议

为每一个查询建立合适的索引
创建合适的复合索引，不要依赖于交叉索引
复合索引字段顺序：匹配条件在前，范围条件在后
尽可能使用覆盖索引
建索引要在后台运行
避免设计过长的数组索引

# 创建单键索引
db.books.createIndex({title:1})
# 对内嵌文档字段创建索引：
db.books.createIndex({"author.name":1})
# 创建复合索引
db.books.createIndex({type:1,favCount:1})
#查看执行计划：未使用索引时 winningPlan 的 stage 为 'COLLSCAN'（全集合扫描）；使用索引时 stage 为 'FETCH'，其 inputStage 为 'IXSCAN'
db.books.find({type:"novel",favCount:{$gt:50}}).explain()
#查看索引信息
db.books.getIndexes()
#查看索引键
db.books.getIndexKeys()
#删除集合指定索引
db.col.dropIndex("索引名称")
#删除集合所有索引   不能删除主键索引
db.col.dropIndexes()
#创建 hash 索引
db.users.createIndex({username : 'hashed'})
#地理空间索引（2dsphere index）就是专门用于实现位置检索的一种特殊索引
db.restaurant.createIndex({location : "2dsphere"})
#全文检索
db.reviews.createIndex( { comments: "text" } )
#通配符索引
db.products.createIndex( { "product_attributes.$**" : 1 } )
# 复合索引支持唯一性约束
db.values.createIndex({title:1,type:1},{unique:true}) // unique compound index: each (title, type) combination may appear only once
#部分索引仅对满足指定过滤器表达式的文档进行索引
db.restaurants.createIndex(
   { cuisine: 1, name: 1 },
   { partialFilterExpression: { rating: { $gt: 5 } } }
)
#不索引不包含xmpp_id字段的文档
db.addresses.createIndex( { "xmpp_id": 1 }, { sparse: true } )
# 创建 TTL 索引，TTL 值为3600秒
db.eventlog.createIndex( { "lastModifiedDate": 1 }, { expireAfterSeconds: 3600 } )
#创建隐藏索引
db.restaurants.createIndex({ borough: 1 },{ hidden: true });
# 隐藏现有索引
db.restaurants.hideIndex( { borough: 1} );
db.restaurants.hideIndex( "索引名称" )
# 取消隐藏索引
db.restaurants.unhideIndex( { borough: 1} );
db.restaurants.unhideIndex( "索引名称" );