当前位置: 首页 > article >正文

ES索引备份

#!/usr/bin/env python
# -*- coding:utf-8 -*-

"""
/**************************************************************
**************************************************************/
获取ES中所有的文档数据
filename data_es.py
python3
"""

import sys
import json
import requests

# Python 3 strings are Unicode by default, so no default-encoding setup is needed.

# Host and port of the Elasticsearch HTTP endpoint used to build every request URL below.
host = "10.233.54.21"
port = 9200

def dump_es_ids(scroll_name, scroll_id, timeout=60):
    """Fetch the next page of an open scroll from Elasticsearch.

    Posts to ``/_search/scroll`` on the module-level ``host``/``port``.

    :param scroll_name: value sent as ``scroll`` in the request body
        (the caller in this file passes ``"5m"``).
    :param scroll_id: scroll id sent as ``scroll_id`` in the request body.
    :param timeout: seconds passed to ``requests.post``; without it a stalled
        server would block the export indefinitely.
    :return: the decoded JSON response body.
    :raises requests.HTTPError: if the response status indicates failure.
    :raises requests.Timeout: if the server does not answer within ``timeout``.
    """
    url = f"http://{host}:{port}/_search/scroll"
    payload = {
        "scroll": scroll_name,
        "scroll_id": scroll_id,
    }
    header = {"Content-Type": "application/json"}

    response = requests.post(url, json=payload, headers=header, timeout=timeout)
    response.raise_for_status()  # surface HTTP errors instead of parsing an error body
    return response.json()

def get_scroll(size, scroll_name, index, timeout=60):
    """Open a scroll on ``index`` and return the first page of results.

    Posts a ``match_all`` query to ``/{index}/_search?scroll=...`` on the
    module-level ``host``/``port``.

    :param size: number of hits requested per page.
    :param scroll_name: value of the ``scroll`` query parameter
        (the caller in this file passes ``"5m"``).
    :param index: name of the index to search.
    :param timeout: seconds passed to ``requests.post``; without it a stalled
        server would block the export indefinitely.
    :return: the decoded JSON response body.
    :raises requests.HTTPError: if the response status indicates failure.
    :raises requests.Timeout: if the server does not answer within ``timeout``.
    """
    url = f"http://{host}:{port}/{index}/_search?scroll={scroll_name}"
    payload = {
        "size": size,
        "query": {
            "match_all": {},
        },
    }
    header = {"Content-Type": "application/json"}

    response = requests.post(url, json=payload, headers=header, timeout=timeout)
    response.raise_for_status()  # surface HTTP errors instead of parsing an error body
    return response.json()

def has_more(result_obj, index):
    """Tell whether a search/scroll response still contains hits.

    :param result_obj: decoded response; ``result_obj["hits"]["hits"]`` is inspected.
    :param index: index name, used only in the log line.
    :return: ``True`` when the hit list is non-empty; ``False`` when it is
        empty or when the response cannot be inspected (the error is printed).
    """
    try:
        hit_count = len(result_obj["hits"]["hits"])
        if hit_count <= 0:
            return False
        print(f"[INFO] index {index} has more data")
        return True
    except Exception as e:
        # Best effort: any malformed response is treated as "no more data".
        print(f"[ERROR] got error {e}")
        return False

def get_id_list(result_obj):
    """Collect the ``_source`` of every hit in a search/scroll response.

    Despite the name, the returned items are the ``_source`` values of the
    hits, not their ids.

    :param result_obj: decoded response; ``result_obj["hits"]["hits"]`` is read.
    :return: list with one ``_source`` value per hit, in response order.
    """
    sources = []
    for hit in result_obj["hits"]["hits"]:
        sources.append(hit["_source"])
    return sources

def main():
    """Export every document of one index to a JSON-lines file.

    Command line: ``<index> <out_file>``. Opens a scroll on ``index``, then
    keeps fetching pages until one comes back with no hits, writing the
    ``_source`` of each hit as one JSON object per line to ``out_file``.

    :return: None
    """
    if len(sys.argv) < 3:
        print(f"[ERROR] usage: {sys.argv[0]} <index> <out_file>", file=sys.stderr)
        sys.exit(1)
    index = sys.argv[1]
    out_file = sys.argv[2]
    scroll_name = "5m"
    size = 1000

    result_obj = get_scroll(size, scroll_name, index)
    scroll_id = result_obj["_scroll_id"]
    print(f"[INFO] scroll_id is {scroll_id}")

    counter = 0
    # ensure_ascii=False emits raw non-ASCII text, so the file encoding must be
    # pinned instead of depending on the platform locale.
    with open(out_file, "w", encoding="utf-8") as out:
        while has_more(result_obj, index):
            result_obj_list = get_id_list(result_obj)
            counter += len(result_obj_list)
            print(f"[INFO] index {index} get data length {len(result_obj_list)}")

            # Persist the current page before requesting the next one, so a
            # failed request does not discard data that was already fetched.
            out.writelines(
                f"{json.dumps(obj, ensure_ascii=False)}\n" for obj in result_obj_list
            )

            result_obj = dump_es_ids(scroll_name, scroll_id)
            # The scroll id may change between requests; always continue with
            # the most recently returned one.
            scroll_id = result_obj.get("_scroll_id", scroll_id)
            print(f"[INFO] index {index} list total length {len(result_obj_list)}")
            print(f"[INFO] index {index} now total logs {counter}")

# Run the export only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()




使用前先修改脚本中的配置:host 改成对应 elasticsearch-master svc 的 IP(例如 host = "10.92.204.60"),port = 9200
# 找到svc
kubectl get svc -n mpks | grep elasticsearch

# 先查看全文索引(导出的索引需要跟rd和drd确认)
curl http://ELASTICSEARCH_MASTER_IP:9200/_cat/indices | grep "fulltext" 

备份执行  curl http://10.233.54.21:9200/_cat/indices | grep "fulltext" | awk -F" " '{if($7>0)print $3}'|awk '{print "touch ",$1," && python3 data_es.py ",$1," ",$1 }' |bash

常见问题:未找到 requests 库
# 方式1  pip install requests -i https://pypi.tuna.tsinghua.edu.cn/simple --trusted-host pypi.tuna.tsinghua.edu.cn

#方式2  去官网下载https://pypi.org/project/requests/#files 然后到解压后的目录去执行:

python setup.py install

http://www.kler.cn/a/331566.html

相关文章:

  • 浅谈目前我开发的前端项目用到的设计模式
  • AI开发-语料-“self-instruct”
  • Linux文件:动静态库制作 动态库链接原理解析
  • 【Vulkan入门】16-IndexBuffer
  • Spring Boot中Bean的 构造器注入、字段注入和方法注入
  • ChatGPT等大语言模型与水文水资源、水环境领域的深度融合
  • 关于建表字段是否该使用 `NOT NULL` 的问题,你怎么看?
  • ubuntu命令行连接wifi
  • Hive数仓操作(十二)
  • C++ 语言特性13 - 强枚举类型
  • IP 数据包分包组包
  • mit6824-01-MapReduce详解
  • 解决 TypeError: Expected state_dict to be dict-like, got <class ‘*‘>.
  • 在 Ubuntu 下通过 Docker 部署 NAS 服务器
  • 损失函数篇 | YOLOv5 引入Unified-IoU 高质量目标检测IoU损失
  • Vue3项目开发——新闻发布管理系统(九)(完结篇)
  • 项目-坦克大战学习-资源冲突解决
  • 算法 | 鹈鹕算法POA-Transformer-LSTM多变量回归预测
  • redis 5的安装及启动(window)
  • csapp_计算机系统通览
  • 数据校验的总结
  • 《开源大模型食用指南》,一杯奶茶速通大模型!新增Examples最佳实践!
  • 【pytorch】pytorch入门5:最大池化层(Pooling layers )
  • SSY20241002提高组T4题解__纯数论
  • nginx配置多域名共用服务器80端口
  • ICM20948 DMP代码详解(60)