本文为博主原创,未经授权,严禁转载及使用。
本文链接:https://blog.csdn.net/zyooooxie/article/details/109588072
前面刚写了 requests发请求 操作Elasticsearch - Search https://blog.csdn.net/zyooooxie/article/details/123730279,再来分享下 使用elasticsearch库 ;
【实际这篇博客推迟发布N个月】
个人博客:https://blog.csdn.net/zyooooxie
【以下所有内容仅为个人项目经历,如有不同,纯属正常】
Python Client
https://www.elastic.co/guide/en/elasticsearch/client/index.html
我使用的 是 7.17.0;
https://pypi.org/project/elasticsearch/7.17.0/
https://www.elastic.co/guide/en/elasticsearch/client/python-api/7.17/overview.html
https://elasticsearch-py.readthedocs.io/en/v7.17.0/index.html
"""
@blog: https://blog.csdn.net/zyooooxie
@qq: 153132336
@email: zyooooxie@gmail.com
"""
import time
import traceback
import sys
import json
import string
import math
import random
from typing import Optional, Union, List, Any
from user_log import Log
from elasticsearch import Elasticsearch
from elasticsearch.helpers import BulkIndexError
gl_es_host_new = 'http://1.1.1.1:1111'
gl_es_host_new_2 = ['http://1.1.1.1:1111', 'http://2.2.2.2:2222']
# ``port`` needs to be an int.
gl_es_host_new_3 = [{'host': '2.2.2.2', 'port': 2222}]
gl_es_host_new_4 = [{'host': '2.2.2.2', 'port': 2222}, {'host': '1.1.1.1', 'port': 1111}]
gl_es_auth = ('es_username', 'es_password')
gl_type = '_doc'
gl_search_dict = {'size': 100, 'from': 0, "sort": {"xxxXXX": {"order": "desc"}}}
# pip install elasticsearch==7.17.0
# https://pypi.org/project/elasticsearch/7.17.0/
# https://www.elastic.co/guide/en/elasticsearch/reference/7.17/docs.html
# https://elasticsearch-py.readthedocs.io/en/v7.17.0/api.html
# doc_type 不建议使用 【Specifying types in requests is deprecated】
# https://www.elastic.co/guide/en/elasticsearch/reference/7.17/removal-of-types.html
# Note that in 7.0, _doc is a permanent part of the path, and represents the endpoint name rather than the document type.
# In Elasticsearch 7.0, each API will support typeless requests, and specifying a type will produce a deprecation warning.
搜索
"""
@blog: https://blog.csdn.net/zyooooxie
@qq: 153132336
@email: zyooooxie@gmail.com
"""
def connect_es_client(hosts: Union[str, list], auth: tuple):
"""
:param hosts:
:param auth:
:return:
"""
client = Elasticsearch(hosts,
sniff_on_start=True, # sniff before doing anything
sniff_on_node_failure=True, # refresh nodes after a node fails to respond
request_timeout=60,
http_auth=auth) # HTTP authentication uses the http_auth parameter by passing in a username and password within a tuple
Log.error('连接-{}'.format(client))
return client
def close_es_client(client: Elasticsearch):
"""
:param client:
:return:
"""
client.close()
Log.error('断开连接')
def _es_search(index_str: str, client: Elasticsearch,
size_: int = 10000, from_: int = 0,
sort_: Union[str, dict] = {"seq": {"order": "desc"}},
get_more_10000: bool = False,
**kwargs):
"""
:param index_str:
:param client:
:param size_:
:param from_:
:param sort_: query 传值是 {"seq": {"order": "desc"}} ; body 是 'seq:desc';
:param get_more_10000:是否查询超过10000条的数据
:param kwargs: 不建议使用 body传参;查全部时,啥都不传;
:return:
"""
# 索引不存在时,返回值是 None
if not client.indices.exists(index=index_str):
return None
# from + size must be less than or equal to: [10000]
assert size_ + from_ <= 10000
# # ✅ New usage:
# es.search(query={...})
#
# # ❌ Deprecated usage:
# es.search(body={"query": {...}})
Log.debug(locals())
# search() 的 from: Defaults to 0. size: Defaults to 10.
# 但有时候为了查出来所有数据,size 默认给 最大值10000,from 默认给0;
res = client.search(index=index_str, size=size_, from_=from_, sort=sort_, **kwargs)
total = res.get('hits').get('total').get('value')
Log.info(f'total:{total}')
hits_len = len(res.get('hits').get('hits'))
Log.info(f'hits有:{hits_len}条')
result = _search_10000(hits_len=hits_len, first_search_result=res, locals_=locals(),
client=client, first_search_size=size_, get_more_10000=get_more_10000)
Log.info(result[-10:])
Log.info(f'search返回的结果有:{len(result)}条')
return result
def _search_10000(client: Elasticsearch, hits_len: int, first_search_result: dict, locals_: dict,
first_search_size: int,
get_more_10000: bool = False):
"""
:param client:
:param hits_len:
:param first_search_result:
:param locals_:
:param first_search_size:
:param get_more_10000:
:return:
"""
if hits_len < first_search_size or not get_more_10000:
if hits_len:
return first_search_result.get('hits').get('hits')
else:
return []
else:
return __search_10000_get_result(client=client, locals_=locals_)
def __search_10000_get_result(client: Elasticsearch, locals_: dict):
"""
:param client:
:param locals_:
:return:
"""
from xxx_use.common_functions import compare_dict_key
one_choice = random.getrandbits(2)
Log.info(one_choice)
if not one_choice:
Log.info('scroll + scan')
scroll_list = __scroll(client=client, locals_=locals_)
scan_list = __scan(client=client, locals_=locals_)
# 很多时候 因为sort值不同
scroll_list = __change_before_compare(scroll_list)
scan_list = __change_before_compare(scan_list)
compare_dict_key(scroll_list, scan_list, assert_0=True)
compare_dict_key(scan_list, scroll_list, assert_0=True)
return scroll_list
elif one_choice == 1:
Log.info('scroll')
return __scroll(client=client, locals_=locals_)
elif one_choice == 2:
Log.info('scan')
return __scan(client=client, locals_=locals_)
else:
# return __limit(client=client, locals_=locals_)
# 不推荐
Log.info('指定seq范围 【自己造的假数据 确保 每条都有seq】')
limit_list = __limit(client=client, locals_=locals_)
scan_list = __scan(client=client, locals_=locals_)
limit_list = __change_before_compare(limit_list)
scan_list = __change_before_compare(scan_list)
compare_dict_key(limit_list, scan_list, assert_0=True)
compare_dict_key(scan_list, limit_list, assert_0=True)
return limit_list
def __change_before_compare(result_list: list):
"""
scroll + scan 结果比较前 对数据做个统一
:param result_list:
:return:
"""
for rl in result_list:
# 每个结果还有一个 _score ,它衡量了文档与查询的匹配程度。默认情况下,首先返回最相关的文档结果,就是说,返回的文档是按照 _score 降序排列的。
rl.pop('sort', '不存在key')
rl.pop('_score', '不存在key')
return result_list
def __scan(client: Elasticsearch, locals_: dict):
"""
:param client:
:param locals_:
:return:
"""
# https://elasticsearch-py.readthedocs.io/en/v7.17.0/helpers.html#scan
from elasticsearch.helpers import scan
# query 要传的是 body for the search() api
# query={"query": {"match": {"blog": "zyooooxie"}}}
result = scan(client=client, index=locals_.get('index_str'), query=locals_.get('kwargs'),
size=5000,
scroll="3m") # Any additional keyword arguments will be passed to the initial search() call
Log.info(f'{result}, {type(result)}')
res = [gr for gr in result]
Log.info(len(res))
return res
def __scroll(client: Elasticsearch, locals_: dict):
"""
:param client:
:param locals_:
:return:
"""
# https://elasticsearch-py.readthedocs.io/en/v7.17.0/api.html#elasticsearch.Elasticsearch.scroll
scroll_time = '3m'
search_res = client.search(index=locals_.get('index_str'), scroll=scroll_time,
query=locals_.get('kwargs').get('query'),
size=5000,
sort=['_doc'])
scroll_id = search_res.get('_scroll_id')
Log.info(scroll_id)
total = search_res.get('hits').get('total').get('value')
Log.info(f'总共有{total}条')
res = search_res.get('hits').get('hits')
while True:
scroll_res = client.scroll(scroll_id=scroll_id, scroll=scroll_time)
scroll_id = scroll_res.get('_scroll_id')
data = scroll_res.get('hits').get('hits')
res.extend(data)
if not data:
break
assert total == len(res)
# Search context are automatically removed when the scroll timeout has been exceeded.
# 手动清理,using the clear-scroll API
clear_res = client.clear_scroll(scroll_id=scroll_id)
Log.info(clear_res)
return res
def __limit(client: Elasticsearch, locals_: dict):
"""
:param client:
:param locals_:
:return:
"""
seq_max: int = get_seq_max(client=client, index_str=locals_.get('index_str'))
query = locals_.get('kwargs').get('query')
search_size = 10000 # search的传参 取最大
limit_size = 5000 # 查询时 以seq排序,每次取的长度
assert limit_size <= search_size
res = list()
for i in range(math.ceil(seq_max / limit_size)):
query_new = {'bool': {'must': [
query,
{'range': {'seq': {'gt': limit_size * i, 'lte': limit_size * (i + 1)}}} # gt、lte
]
}}
# Log.info(query_new)
search_res = client.search(index=locals_.get('index_str'),
query=query_new,
size=search_size)
data = search_res.get('hits').get('hits')
res.extend(data)
else:
Log.info(len(res))
return res
本文链接:https://blog.csdn.net/zyooooxie/article/details/109588072
个人博客 https://blog.csdn.net/zyooooxie