Overview: the design approach and usage of this script.
How it works: read the data from Elasticsearch -> convert it to CSV -> load the file into HDFS (a Hive temp/staging table) -> load it into the target table via SQL.
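The SQL loading step assumes both Hive tables already exist: a staging (temp) table whose HDFS location matches the upload path in the script and whose field delimiter matches the CSV, plus a partitioned target table. A minimal sketch of what that one-time setup might look like via PyHive (column names are illustrative; only the table names and the pt partition come from the script below):

from pyhive import hive

conn = hive.Connection(host='xxx.xxx.xxx.xxx', port=10000, database='test')
cursor = conn.cursor()

# Staging table: plain comma-delimited text. In practice there is one STRING
# column per key in keys_data, in the same order; only two are shown here.
cursor.execute(
    "CREATE TABLE IF NOT EXISTS test.ods__ck__sam_es_di_new_tmp ("
    "  onlineDetailUuid STRING,"
    "  userId STRING"
    ") ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' STORED AS TEXTFILE"
)

# Target table: same columns as the staging table, partitioned by day (pt).
cursor.execute(
    "CREATE TABLE IF NOT EXISTS test.ods__ck__sam_es_di_new ("
    "  onlineDetailUuid STRING,"
    "  userId STRING"
    ") PARTITIONED BY (pt STRING)"
)

cursor.close()
conn.close()

Note that a plain delimited text table does not interpret CSV quoting; if field values may contain commas, OpenCSVSerde is a safer storage format for the staging table.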
When reading data with this script, it is best to target a single index; matching many indices with the wildcard * risks very slow transfer and excessive memory use on the machine running the script.
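If a broader index pattern is unavoidable, it helps to estimate the result size before exporting. A small pre-flight sketch using the count API (the address, credentials, index name, and threshold are placeholders):

from elasticsearch import Elasticsearch

es = Elasticsearch(["xxx.xxx.xxx.xxx"], port=9200, http_auth=('*', '*'))
body = {"query": {"range": {"logoutTime": {"gte": "2021-01-11T00:00:00.000+08:00",
                                           "lte": "2021-01-12T00:59:59.999+08:00"}}}}
# Count matching documents before pulling them all through the scroll API.
total = es.count(index="index_name", body=body)['count']
if total > 5000000:  # arbitrary threshold; tune for your hardware
    raise SystemExit("result set too large, narrow the index pattern or time range")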
import csv
from pyhive import hive
from hdfs import InsecureClient
from elasticsearch import Elasticsearch


def conn2es():
    # Connect to the Elasticsearch cluster (address and credentials are masked).
    es = Elasticsearch(["xxx.xxx.xxx.xxx"], port=9200, http_auth=('*', '*'))
    return es


# Query ES and dump the hits to a local CSV file.
def query():
    es = conn2es()
    body = {
        "query": {
            "range": {
                "logoutTime": {
                    "gte": "2021-01-11T00:00:00.000+08:00",
                    "lte": "2021-01-12T00:59:59.999+08:00"
                }
            }
        }
    }
    query = es.search(index="index_name", doc_type='type_name', body=body, scroll='5m', size=1000)
    res = query['hits']['hits']        # first page of results
    total = query['hits']['total']     # total number of matching documents
    scroll_id = query['_scroll_id']    # cursor used to page through the remaining results
    for i in range(0, int(total / 1000) + 1):
        # the scroll parameter must be passed here as well, otherwise ES raises an error
        query_scroll = es.scroll(scroll_id=scroll_id, scroll='5m')['hits']['hits']
        res += query_scroll

    emp_data = []
    # mapping keys to extract from each document, in CSV column order
    keys_data = ['onlineDetailUuid', 'userId', 'operatorsConfigUuid', 'operatorsUserId', 'operatorsName',
                 'gatewayStrategy', 'userMac', 'usergroupId', 'userIpv4', 'userIpv6', 'userIpv6Locallink',
                 'ipv6Num', 'userVlan', 'ceVlan', 'peVlan', 'domainName', 'netMask', 'gateway', 'ipv6Gateway',
                 'dns', 'nasIp', 'nasIpv6', 'nasPort', 'nasCommunity', 'nasName', 'nasLocation', 'nasType',
                 'nasTypeNum', 'serviceId', 'policyId', 'accountId', 'wpNasIp', 'wpNasPort', 'proxyName',
                 'accessCtrl', 'loginTime', 'logoutTime', 'onlineSec', 'terminateCause', 'tunnelClient',
                 'tunnelServer', 'accessType', 'apMac', 'ssid', 'isRoaming', 'areaName', 'acctSessionId',
                 'totalTraffic', 'userTemplateUuid', 'createTime', 'lastUpdateTime', 'createHost',
                 'lastUpdateHost', 'packageName', 'serviceSuffix', 'accountingRule', 'timesegmentId',
                 'suConnVersion', 'suVersion', 'accessEquipmentType', 'terminalOperationSystem',
                 'terminalTypeDes', 'visitorAuthType', 'isOperatorsAuth', 'hasReset', 'operatorsSessionId',
                 'passThroughType', 'packetUuid', 'accessDeviceIp', 'accessDeviceType', 'accessDevicePort',
                 'accessDeviceDescription', 'accessDeviceInterface', 'operatorFirstLog']
    for hit in res:
        data = hit['_source']
        csv_data = []
        for key_data in keys_data:
            if key_data in data:
                csv_data.append(data[key_data])
            else:
                csv_data.append('null')
        emp_data.append(csv_data)

    # newline='' keeps csv.writer from emitting blank lines on Windows
    with open('emp_data.csv', 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerows(emp_data)


# Upload the CSV into the staging table's HDFS directory, then load the target table via SQL.
def hdfsWriteFile():
    conn = hive.Connection(host='xxx.xxx.xxx.xxx', port=10000, database='test')
    cursor = conn.cursor()
    hdfs_client = InsecureClient('http://ns1:50070')
    emp_files = hdfs_client.list(hdfs_path='/user/hive/warehouse/test.db/ods__ck__sam_es_di_new_tmp/')
    # if the staging directory already contains files, delete them first
    for emp_file in emp_files:
        hdfs_client.delete(hdfs_path='/user/hive/warehouse/test.db/ods__ck__sam_es_di_new_tmp/' + emp_file)
    remote_path = hdfs_client.upload(hdfs_path='/user/hive/warehouse/test.db/ods__ck__sam_es_di_new_tmp/',
                                     local_path='emp_data.csv', n_threads=2, chunk_size=2 ** 16)
    if remote_path is not None:
        cursor.execute("insert overwrite table test.ods__ck__sam_es_di_new partition(pt='20210204') "
                       "select * from test.ods__ck__sam_es_di_new_tmp")
    else:
        return 'file upload failed'
    cursor.close()
    conn.close()
    return remote_path


def main():
    query()
    hdfs_path = hdfsWriteFile()
    print(hdfs_path)


if __name__ == '__main__':
    main()
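query() above keeps every hit in memory before writing the CSV, which is exactly where the memory risk mentioned earlier comes from. As an alternative sketch (not part of the original script), the same export can be streamed with elasticsearch.helpers.scan, which pages through the scroll and clears it when finished; body and keys_data here stand for the same values defined inside query():

import csv
from elasticsearch import helpers

def query_streaming():
    # Memory-friendlier variant: write each row as it arrives instead of
    # accumulating all hits in a list; only one scroll page is held at a time.
    es = conn2es()
    with open('emp_data.csv', 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        for hit in helpers.scan(es, query=body, index="index_name",
                                scroll='5m', size=1000):
            data = hit['_source']
            writer.writerow([data.get(k, 'null') for k in keys_data])

Because helpers.scan yields one hit at a time, memory use stays roughly proportional to the scroll page size (size=1000) rather than to the full result set.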