曼联VS利物浦直播_曼联VS利物浦免费在线高清直播_曼联VS利物浦视频在线观看无插件

2026-05-27 12:01:39

曼联VS利物浦直播_曼联VS利物浦免费在线高清直播_曼联VS利物浦视频在线观看无插件

#本文主要是为了批量爬取竞彩网赛果开奖数据,网址为:http://info.sporttery.cn/basketball/match_result.php

#而且需要的数据是每场比赛的开奖结果中的详细固定奖金

#如果手动爬取的话是个很麻烦的纯体力活,所以果断写了个爬虫来完成。

#首先不知道怎么模拟打开浏览器的请看我前面的文章:http://blog.csdn.net/trisyp/article/details/78688106

#根据那篇文章配置好了运行环境之后就可以执行本博文代码

完整代码如下:

import requests

from bs4 import BeautifulSoup

from selenium import webdriver

import os

import time

def startDriver(): #启动chrome浏览器

# 构造模拟浏览器

chromedriver = "C:/Program Files (x86)/Google/Chrome/Application/chromedriver" # 驱动所在路径

os.environ["webdriver.chrome.driver"] = chromedriver

driver = webdriver.Chrome(chromedriver) # 模拟打开浏览器

time.sleep(2)

return driver

def getDatenumber(url): #获取主页中的赛事日期和赛事编号

r = requests.get(url)

r.raise_for_status()

r.encoding = r.apparent_encoding

html = r.text

soup = BeautifulSoup(html, 'html.parser') # 用HTML解析网址

tag = soup.find_all('table', attrs={"class": {"m-tab"}})

tag1 = tag[0].find_all('tr')

dateNumber = []

for i in range(len(tag1)):

tag2 = tag1[i].find_all('td')

try:

info = tag2[0].text+tag2[1].text #获取标中的内容

except:

break

dateNumber.append(info)

del dateNumber[-1]

return dateNumber

def getHTML(driver, url, xpath): #模拟浏览器打开网页,并获得最新窗口中的网页

driver.get(url) # 打开网址

# 模拟点击更多评论

time.sleep(2)

driver.find_element_by_xpath(xpath).click()

time.sleep(3)

driver.switch_to_window(driver.window_handles[-1]) #跳转到当前窗口

html = driver.page_source

return html

def getTableName(html): #获取每个table的表名

soup = BeautifulSoup(html, 'html.parser') # 用HTML解析网址

tag = soup.find_all('div', attrs={'class': {'kj-tit'}})

tableName = []

for infoi in tag:

tableName.append(infoi.text.replace("

", "").replace(" ", ""))

return tableName

def fillUnivlist(driver, url): #保存网页中间两个表格的内容

dateNumbers = getDatenumber(url)

result = []

count = 0

for k in range(len(dateNumbers)):

xpath = "/html/body/div[4]/div[4]/table/tbody/tr["+str(k+1)+"]/td[13]/a"

html = getHTML(driver, url, xpath) # 获取HTML

tableNames = getTableName(html) #获取表名

soup = BeautifulSoup(html, 'html.parser') # 用HTML解析网址

tag = soup.find_all('table', attrs={'class': {'kj-table'}}) #获取所有表格

# print(str(tag[0]))

for i in range(1, 3):

infoTag = tag[i]

contentTr = infoTag.find_all('tr')

for j in range(len(contentTr)):

if j == 0:

contentTh = contentTr[j].find_all('th')

info1 = dateNumbers[k] + "," + tableNames[i]

for infok in contentTh:

info1 = info1 + "," + infok.text.replace(" ", "")

else:

contentTd = contentTr[j].find_all('td')

info1 = dateNumbers[k] + "," + tableNames[i]

for infok in contentTd:

info1 = info1 + "," + infok.text

result.append(info1)

count += 1

print("

当前页进度: {:.2f}%".format(count * 100 / len(dateNumbers)), end="")

return result

def writeUnivlist(result, fpath, num):

with open(fpath, 'a', encoding='utf-8') as f: #以追加的方式存储内容

for i in range(num):

f.write(result[i] + '

')

f.close()

def main():

for i in range(9):

driver = startDriver()

url = "http://info.sporttery.cn/basketball/match_result.php?page="+str(i+1)+"&start_date=2017-11-05&end_date=2017-12-05" # 要访问的网址

result = fillUnivlist(driver, url)

output_file = 'D:/page' + str(i + 1) + '.txt'

writeUnivlist(result, output_file, len(result))

driver.close()

time.sleep(2)

print("第"+str(i+1)+"页爬取完毕!")

if __name__ == '__main__':

main()

#至于Xpath的获取参考上篇博客最后一句话:http://blog.csdn.net/trisyp/article/details/78712715。