用elasticsearch分析中国大学省份分布-白红宇

用elasticsearch分析中国大学省份分布

阅读量：4322 次

发布时间：2019-06-06

本文共 1918 字，大约阅读时间需要 6 分钟。

1.去教育部官网下载excel数据：http://www.moe.gov.cn/srcsite/A03/moe_634/201706/W020170616379651135432.xls

2.把xls数据转换成json格式

# [root@do1_qy_10479 opt]# cat  just4json.py
# encoding: utf-8
"""Convert the first sheet of aa.xls into a JSON array stored in tojson.json.

The row at index 2 supplies the keys. Every row from index 4 to the end of
the sheet becomes one JSON object mapping each key to the cell in the same
column. Each key/cell pair is also echoed to stdout while converting.
"""
import codecs
import json
from collections import OrderedDict

import xlrd

workbook = xlrd.open_workbook('aa.xls')
sheet = workbook.sheet_by_index(0)
headers = sheet.row_values(2)

records = []
for row_index in range(4, sheet.nrows):
    cells = sheet.row_values(row_index)
    # OrderedDict keeps the spreadsheet's column order in the emitted JSON.
    record = OrderedDict()
    for col_index, cell in enumerate(cells):
        key = headers[col_index]
        print(key, cell)
        record[key] = cell
    records.append(record)

# ensure_ascii=False writes non-ASCII text as-is instead of \uXXXX escapes.
payload = json.dumps(records, ensure_ascii=False)
with codecs.open('tojson.json', "w", "utf-8") as out_file:
    out_file.write(payload)

3.得到的json数据

[{
   "rowid": 2631.0, "name": "新疆工业职业技术学院", "code": "4265051060", "charge": "新疆维吾尔自治区", "location": "乌鲁木齐市", "level": "专科", "remark": ""}]

4.把json格式的数据导入elasticsearch

# python 3.6
# -*- coding:utf-8 -*-
"""Upload every record of a JSON array file to Elasticsearch through curl.

Reads DATA_FILE, which must contain a JSON array, and issues one
``curl -XPUT <ES_URL>/<INDEX>/_doc/<n>`` request per element, where <n> is
the element's zero-based position in the array. Each command is printed
after it has run; a non-zero curl exit status is reported on stderr and the
upload continues with the next record.
"""
__author__ = 'BH8ANK'

import json
import shlex
import subprocess
import sys

DATA_FILE = r"/opt/englishjson.json"
ES_URL = "http://172.31.250.16:9200"  # set the Elasticsearch IP:PORT here
INDEX = "daxue04"

# Preprocess the input file; the context manager closes the handle once the
# JSON has been parsed.
with open(DATA_FILE, "r", encoding='UTF-8') as src:
    records = json.load(src)

# Build the curl command for each record and upload the data.
for doc_id, record in enumerate(records):
    data = json.dumps(record, ensure_ascii=False)
    cmd = [
        "curl", "-XPUT", "{}/{}/_doc/{}".format(ES_URL, INDEX, doc_id),
        "-H", "Content-Type: application/json",
        "-d", data,
    ]
    # An argument list bypasses the shell entirely, so a record containing
    # quotes or other shell metacharacters cannot break or alter the command.
    result = subprocess.run(cmd)
    # Print a shell-quoted rendering so the command can be copy-pasted.
    print(" ".join(shlex.quote(arg) for arg in cmd))
    if result.returncode != 0:
        print(
            "curl exited with status {} for document {}".format(
                result.returncode, doc_id),
            file=sys.stderr,
        )

5.查询es的数据

GET daxue05/_doc/0
{
  "_index" : "daxue05",
  "_type" : "_doc",
  "_id" : "0",
  "_version" : 1,
  "_seq_no" : 0,
  "_primary_term" : 1,
  "found" : true,
  "_source" : {
    "rowid" : 1.0,
    "name" : "北京大学",
    "code" : "4111010001",
    "charge" : "教育部",
    "location" : "北京市",
    "level" : "本科",
    "remark" : ""
  }
}

6.参考：