Python 爬虫

python爬虫爬取Tiobe编程语言top20写入excel

背景

excel表格的应用频率在工作生活中都很高,碰到大量手工录入的情景就会让人非常厌烦,这时就可以通过一些脚本来自动化这些过程。我这里爬取Tiobe上的top20做一个小demo,熟悉一下库的基本用法。

第三方库

爬虫部分很简单,不涉及抓包也不需要正则,利用requests和BeautifulSoup就可以轻松解析出信息。重点在于操作Excel:Python有xlrd和xlwt两个库,看名字就知道xlrd用于读取表格,xlwt用于写入表格。需要注意的是,xlwt生成的是旧版的.xls格式,而不是.xlsx。

实现思路

爬取网页中表格,用dict存每种编程语言的信息(感觉自己深受js对象的影响),然后list存20个dict。

具体实现

首先是爬虫部分,比较简单就不解释了

class tiobe_Spider:
    """Scrape the top-20 programming language table from the TIOBE index page.

    Instance state:
        data: rows produced by the most recent ``getRank`` call.
        url: address of the page that is fetched.
        headers: HTTP headers sent with the request.
    """

    def __init__(self):
        self.data = []
        self.url = 'https://www.tiobe.com/tiobe-index/'
        self.headers = {}

    def getRank(self):
        """Fetch the TIOBE page and return its top-20 table as a list of dicts.

        Each dict has the keys ``rank``, ``oldrank``, ``name``, ``ratings``
        and ``change``; every value is the raw text of a table cell.

        Returns:
            The list stored in ``self.data``. It is rebuilt on every call, so
            calling this method twice does not duplicate rows.

        Raises:
            requests.RequestException: the request failed, timed out, or the
                server answered with an HTTP error status.
            RuntimeError: the page does not contain the expected table.
        """
        self.headers['user-agent'] = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.78 Safari/537.36'
        # Without a timeout requests may block forever on a stalled connection.
        res = requests.get(self.url, headers=self.headers, timeout=10)
        # Fail here on 4xx/5xx instead of trying to parse an error page.
        res.raise_for_status()
        soup = BeautifulSoup(res.text, 'html.parser')
        table = soup.find('table', class_="table table-striped table-top20")
        if table is None:
            raise RuntimeError('top-20 table not found at ' + self.url)

        rows = []
        # Walk the table row by row rather than striding over a flat cell
        # list, so one malformed row cannot shift every following row or run
        # past the end of the list.
        for tr in table.find_all('tr'):
            cells = tr.find_all('td')
            if len(cells) < 6:
                # Rows without enough <td> cells (e.g. a header row) are skipped.
                continue
            # NOTE(review): positions assume the 6-cell row layout this scraper
            # was written for; cell 2 is intentionally not read. Verify against
            # the live page if the output looks misaligned.
            rows.append({
                'rank': cells[0].text,
                'oldrank': cells[1].text,
                'name': cells[3].text,
                'ratings': cells[4].text,
                'change': cells[5].text,
            })

        self.data = rows
        return self.data

写入excel部分用的是xlwt库,介绍几个基本方法:

workbook = xlwt.Workbook(encoding='ascii')  # create a workbook (xlwt emits the legacy .xls format); byte strings are decoded as ASCII
sheet1 = workbook.add_sheet('sheet1')  # add a worksheet named "sheet1" to the workbook
sheet1.write(row, col, label="1")  # write "1" into the cell at zero-based (row, col) of sheet1
class excel_Handler:
    """Write scraped ranking rows into a spreadsheet using xlwt."""

    def __init__(self):
        self.workbook = xlwt.Workbook(encoding = 'ascii')
        self.table = self.workbook.add_sheet('tioberank')

    def writeToExcle(self, data, filename='./tioberank.xls'):
        """Write a header row plus one row per entry of ``data`` and save the file.

        Args:
            data: sequence of dicts, each providing the keys ``rank``,
                ``oldrank``, ``name``, ``ratings`` and ``change``.
            filename: path of the file to save. The default uses the ``.xls``
                extension because xlwt writes the legacy Excel 97-2003 format;
                saving that content under ``.xlsx`` yields a file whose
                extension does not match its format.

        Raises:
            KeyError: an entry of ``data`` lacks one of the expected keys.
        """
        columns = ('rank', 'oldrank', 'name', 'ratings', 'change')

        for col, title in enumerate(columns):
            self.table.write(0, col, label=title)

        # Iterate over what was actually scraped instead of assuming exactly
        # 20 entries, so a shorter result no longer raises IndexError.
        for row, entry in enumerate(data, 1):
            for col, key in enumerate(columns):
                self.table.write(row, col, label=entry[key])

        self.workbook.save(filename)
        print('data has been written to %s' % filename)

完整代码

#!/usr/bin/env python
import requests
from bs4 import BeautifulSoup
import xlwt

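# data = [obj1, obj2, ..., obj20]  (one dict per row of the top-20 table)
# obj = {
#   rank: STRING,
#   oldrank: STRING,
#   name: STRING,
#   ratings: STRING,
#   change: STRING
# }
# All values are the raw cell text taken from the page.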

class tiobe_Spider:
    """Scrape the top-20 programming language table from the TIOBE index page.

    Instance state:
        data: rows produced by the most recent ``getRank`` call.
        url: address of the page that is fetched.
        headers: HTTP headers sent with the request.
    """

    def __init__(self):
        self.data = []
        self.url = 'https://www.tiobe.com/tiobe-index/'
        self.headers = {}

    def getRank(self):
        """Fetch the TIOBE page and return its top-20 table as a list of dicts.

        Each dict has the keys ``rank``, ``oldrank``, ``name``, ``ratings``
        and ``change``; every value is the raw text of a table cell.

        Returns:
            The list stored in ``self.data``. It is rebuilt on every call, so
            calling this method twice does not duplicate rows.

        Raises:
            requests.RequestException: the request failed, timed out, or the
                server answered with an HTTP error status.
            RuntimeError: the page does not contain the expected table.
        """
        self.headers['user-agent'] = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.78 Safari/537.36'
        # Without a timeout requests may block forever on a stalled connection.
        res = requests.get(self.url, headers=self.headers, timeout=10)
        # Fail here on 4xx/5xx instead of trying to parse an error page.
        res.raise_for_status()
        soup = BeautifulSoup(res.text, 'html.parser')
        table = soup.find('table', class_="table table-striped table-top20")
        if table is None:
            raise RuntimeError('top-20 table not found at ' + self.url)

        rows = []
        # Walk the table row by row rather than striding over a flat cell
        # list, so one malformed row cannot shift every following row or run
        # past the end of the list.
        for tr in table.find_all('tr'):
            cells = tr.find_all('td')
            if len(cells) < 6:
                # Rows without enough <td> cells (e.g. a header row) are skipped.
                continue
            # NOTE(review): positions assume the 6-cell row layout this scraper
            # was written for; cell 2 is intentionally not read. Verify against
            # the live page if the output looks misaligned.
            rows.append({
                'rank': cells[0].text,
                'oldrank': cells[1].text,
                'name': cells[3].text,
                'ratings': cells[4].text,
                'change': cells[5].text,
            })

        self.data = rows
        return self.data

class excel_Handler:
    """Write scraped ranking rows into a spreadsheet using xlwt."""

    def __init__(self):
        self.workbook = xlwt.Workbook(encoding = 'ascii')
        self.table = self.workbook.add_sheet('tioberank')

    def writeToExcle(self, data, filename='./tioberank.xls'):
        """Write a header row plus one row per entry of ``data`` and save the file.

        Args:
            data: sequence of dicts, each providing the keys ``rank``,
                ``oldrank``, ``name``, ``ratings`` and ``change``.
            filename: path of the file to save. The default uses the ``.xls``
                extension because xlwt writes the legacy Excel 97-2003 format;
                saving that content under ``.xlsx`` yields a file whose
                extension does not match its format.

        Raises:
            KeyError: an entry of ``data`` lacks one of the expected keys.
        """
        columns = ('rank', 'oldrank', 'name', 'ratings', 'change')

        for col, title in enumerate(columns):
            self.table.write(0, col, label=title)

        # Iterate over what was actually scraped instead of assuming exactly
        # 20 entries, so a shorter result no longer raises IndexError.
        for row, entry in enumerate(data, 1):
            for col, key in enumerate(columns):
                self.table.write(row, col, label=entry[key])

        self.workbook.save(filename)
        print('data has been written to %s' % filename)

def main():
    """Scrape the TIOBE ranking and hand the rows to the Excel writer."""
    scraper, writer = tiobe_Spider(), excel_Handler()
    rankings = scraper.getRank()
    writer.writeToExcle(rankings)

# Run the scrape-and-export pipeline only when executed as a script, not on import.
if __name__ == "__main__":
    main()

写入前后

发表评论

电子邮件地址不会被公开。 必填项已用*标注