# Chapter 11  Data Acquisition for Data Visualization
# Section 1  Python Web Scraping Basics
# 1. Request web page data from a URL
import requests
# Fetch the page HTML as text (the part after '#' is a URL fragment and is not sent to the server)
url = 'http://music.163.com/#/artist?id=6697'
r = requests.get(url).text
print(r)
r = requests.post("http://httpbin.org/post")
r = requests.put("http://httpbin.org/put")
r = requests.delete("http://httpbin.org/delete")
r = requests.head("http://httpbin.org/get")
r = requests.options("http://httpbin.org/get")
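# A minimal sketch (an addition, not from the original) of inspecting the Response
# object that the calls above return; httpbin.org echoes the request back as JSON.
resp = requests.get("http://httpbin.org/get", params={"demo": "1"}, timeout=10)
print(resp.status_code)                  # numeric HTTP status, e.g. 200
print(resp.headers.get("Content-Type"))  # response headers behave like a dict
print(resp.json())                       # parse the JSON body returned by httpbin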
# 2. Request data from a W3School page with query-string parameters
import requests
keyword = 'html_image'
try:
    payload = {'f': keyword}   # query-string parameters
    r = requests.get("http://www.w3school.com.cn/tiy/t.asp", params=payload)
    print(r.url)    # the final URL with the encoded query string
    print(r.text)
except Exception as e:
    print("error:", e)
# 3. Request image data and save it to a file
import requests
try:
    r = requests.get("http://www.w3school.com.cn/i/eg_mouse.jpg")
    print(r.url)
    print(r.content)   # raw binary content of the image
    with open('w3c.jpg', 'wb') as f:
        f.write(r.content)
except Exception as e:
    print("error:", e)
# Section 2  Parsing Data
# 1. Parse data with BeautifulSoup (bs4)
from bs4 import BeautifulSoup
import requests
try:
    r = requests.get("http://www.w3school.com.cn/tiy/t.asp?f=jseg_text")
    soup = BeautifulSoup(r.text, 'html.parser')
    a_tags = soup.find_all('a')   # all <a> tags in the page
    print(a_tags)
except Exception as e:
    print("error:", e)
# 2. Parse data with regular expressions
import re
text = 'data=["中国","美国","日本","英国"];city=["广州","北京","上海","深圳"];'
result = re.match(r'data=(.*?);', text)   # match the leading data=[...] segment
print(result.group(1))
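# A hedged extension (not in the original): re.findall with two capture groups pulls
# out both the data=[...] and city=[...] lists in one pass.
for name, items in re.findall(r'(\w+)=\[(.*?)\];', text):
    print(name, items.split(','))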
# Section 3  Data Storage and Knowledge Graphs
# 1. Save data to a TXT file
data=["名称最新价涨跌幅",
"永安行 38.66 43.99%",
"秦港股份 3.71 10.09%",
"中船科技 18.54 10.03%"]
with open('data.txt','w+',encoding='utf-8') as f:
for d in data:
f.writelines(d+'\n')
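# A quick check (a sketch, not part of the original): read the file back to confirm
# that one record was written per line.
with open('data.txt', 'r', encoding='utf-8') as f:
    for line in f:
        print(line.strip())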
# 2. Save data to a MySQL database
import pymysql
data=[['永安行', '38.66', '43.99%'],
['秦港股份', '3.71', '10.09%'],
['中船科技', '18.54', '10.03%']]
# pymysql.connect(host, user, password, database)
db = pymysql.connect(host="localhost", user="root", password="123456",
                     database="SpiderDB", charset="utf8")
cursor = db.cursor()
try:
    for d in data:
        cursor.execute("INSERT INTO stock(name, price, rise) VALUES (%s, %s, %s)",
                       (d[0], d[1], d[2]))
        print("ok")
    db.commit()
except Exception as e:
    db.rollback()
    print("error:", e)
db.close()
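# The INSERT above assumes a stock table with name/price/rise columns. The column types
# below are an assumption (the original does not show the schema); this sketch creates
# the table so the example is reproducible.
db = pymysql.connect(host="localhost", user="root", password="123456",
                     database="SpiderDB", charset="utf8")
cursor = db.cursor()
cursor.execute("""
    CREATE TABLE IF NOT EXISTS stock (
        name  VARCHAR(50),
        price VARCHAR(20),
        rise  VARCHAR(20)
    ) DEFAULT CHARSET=utf8
""")
db.commit()
db.close()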
# 3. Crawl Wikipedia data to build a small knowledge graph
# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup
import json
import re
import queue
# Simple container for one crawled page (defined here but not used later in this script)
class leaf:
    def __init__(self, name, brief, lid, children, url):
        self.name = name
        self.brief = brief
        self.id = lid
        self.children = children
        self.url = url
keywordDict = []         # page titles seen so far; a title's index doubles as its node id
nodes = []               # graph nodes: {"name", "group"}
links = []               # graph edges: {"source", "target", "value"}
urllist = queue.Queue()  # BFS queue of [url, level, fatherId] entries
depth = 3                # maximum crawl depth
leafMax = 3              # maximum number of child links followed per page
baseUrl = "https://en.wikipedia.org"
Default_Header={
'accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
'accept-encoding':'gzip, deflate, br',
'accept-language':'zh-CN,zh;q=0.8',
'cache-control':'max-age=0',
'cookie':'CP=H2; GeoIP=JP:13:Tokyo:35.64:139.77:v4; WMF-Last-Access=12-Aug-2017; WMF-Last-Access-Global=12-Aug-2017; TBLkisOn=0',
'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.104 Safari/537.36'
}
_session = requests.Session()
# requests.adapters.DEFAULT_RETRIES = 3
_session.headers.update(Default_Header)
# _session.keep_alive = False
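# A possible replacement (a sketch, not in the original) for the commented-out retry
# setting above: mount an HTTPAdapter with max_retries on the session.
# from requests.adapters import HTTPAdapter
# _session.mount('http://', HTTPAdapter(max_retries=3))
# _session.mount('https://', HTTPAdapter(max_retries=3))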
def addKeyword(word):
    # Return the node id (index in keywordDict) for word, adding it if it is new.
    if word in keywordDict:
        return keywordDict.index(word)
    keywordDict.append(word)
    return len(keywordDict) - 1
def isNew(lid):
    # A node is "new" if it was just appended to the end of keywordDict.
    return lid >= len(keywordDict) - 1
def getHTMLText(url):
    # Download a page with the shared session; return "" on any failure.
    try:
        r = _session.get(url)
        r.raise_for_status()
        r.encoding = "UTF-8"
        return r.text
    except Exception:
        return ""
def wideLeaf(html, level, fid):
    # Parse one Wikipedia page: register it as a node, queue up to leafMax linked
    # pages found in its introduction, and add an edge back to the parent node fid.
    try:
        soup = BeautifulSoup(html, 'lxml')
        name = str(soup.find('h1', attrs={'id': 'firstHeading'}).string)
        lid = addKeyword(name)
        print(name + ' ' + str(len(keywordDict)) + ' ' + str(isNew(lid)))
        if isNew(lid):
            nodes.append({"name": name, "group": lid})
        contentHtml = soup.find('div', attrs={'class': 'mw-parser-output'})
        briefHtml = contentHtml.find_all('p', recursive=False)
        count = 0
        for tag in briefHtml:
            strings = re.findall(r'>.*?<', str(tag))
            sentence = re.sub(r'[><]', '', ''.join(strings))
            # the introduction ends at the first empty paragraph
            if sentence == '':
                break
            for a in tag.find_all('a', href=re.compile('/wiki/')):
                urllist.put([a.attrs['href'], level + 1, lid])
                count += 1
                if count > leafMax:
                    break
        if fid > -1:
            links.append({"source": fid, "target": lid, "value": 1})
    except Exception as e:
        print(e)
        return
# queue item layout: [0] url, [1] level, [2] fatherId
def wideWiki(start_url, dep):
    # Breadth-first crawl starting from start_url, down to depth dep.
    wideLeaf(getHTMLText(start_url), 1, -1)
    while not urllist.empty():
        item = urllist.get()
        print(str(item) + ' ' + str(urllist.qsize()))
        if item[1] == dep + 1:
            break
        wideLeaf(getHTMLText(baseUrl + item[0]), item[1], item[2])
def main():
    keyword = 'JavaScript'
    start_url = 'https://en.wikipedia.org/wiki/' + keyword
    wideWiki(start_url, depth)
    # dump the graph in a JavaScript-friendly "nodes=...;\nlinks=..." format
    with open('wikidata.json', 'w+', encoding='utf-8') as f:
        f.write("nodes=" + json.dumps(nodes, ensure_ascii=False))
        f.write(";\nlinks=" + json.dumps(links, ensure_ascii=False))

if __name__ == "__main__":
    main()
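# A sketch (not in the original) for checking the output: wikidata.json holds two
# JavaScript-style assignments, "nodes=[...]" and "links=[...]", which can be parsed
# back with json.loads after stripping the variable names.
with open('wikidata.json', encoding='utf-8') as f:
    nodes_part, links_part = f.read().split(";\n", 1)
check_nodes = json.loads(nodes_part.replace("nodes=", "", 1))
check_links = json.loads(links_part.replace("links=", "", 1))
print(len(check_nodes), "nodes,", len(check_links), "links")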