import datetime
import re

import pymysql
import requests
from bs4 import BeautifulSoup


# Database wrapper
class Mydb():
    """Thin wrapper around a single pymysql connection/cursor pair.

    Opens the connection eagerly in __init__ and exposes execute() for
    parameterized statements with commit/rollback handling.
    """

    def __init__(self):
        try:
            # Keyword arguments: positional connect() arguments were
            # removed in PyMySQL 1.0, so this form works on old and new.
            self.conn = pymysql.connect(host='127.0.0.1',
                                        user='root',
                                        password='123456',
                                        database='py11',
                                        charset='utf8')
            self.cursor = self.conn.cursor()
        except Exception as e:
            print(e)

    def execute(self, sql, data):
        """Run one parameterized statement and commit; roll back on failure."""
        try:
            res = self.cursor.execute(sql, data)
            self.conn.commit()
        except Exception as e:
            self.conn.rollback()
            print(e)


base_url = 'http://hr.tencent.com/position.php?start=%d'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
}


# Handle a detail page
def parse_detail(url):
    """Fetch one job-detail page, extract its fields and upsert into MySQL.

    Relies on the module-level ``mydb`` created under ``__main__``.
    """
    response = requests.get(url, headers=headers)
    html = BeautifulSoup(response.text, 'lxml')
    # Position title
    position_name = html.select('tr[class="h"]')[0].text.strip()
    # Second table row holds location / category / headcount cells
    info = html.select('table.tablelist tr')
    location = info[1].select('td')[0].contents[-1]
    p_type = info[1].select('td')[1].contents[-1]
    p_number = info[1].select('td')[2].contents[-1].strip('人')
    # Job duties: concatenate every <li> in the third row
    duty = ''.join(li.text for li in info[2].select('li'))
    # Job requirements: concatenate every <li> in the fourth row
    requirement = ''.join(li.text for li in info[3].select('li'))
    # Crawl timestamp (system time)
    crawl_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    # Numeric id from the (possibly redirected) detail URL; unique key in DB
    id_pat = re.compile(r'id=(\d+)')
    res = id_pat.search(response.url)
    url_id = res.group(1)
    # Upsert: duplicate url_id refreshes position_name instead of failing
    sql = 'insert into ceshi(url_id,position_name,location,p_type,p_number,duty,requirement,crawl_time) VALUES(%s,%s,%s,%s,%s,%s,%s,%s) ' \
          'on duplicate key update position_name=values(position_name)'
    data = [url_id, position_name, location, p_type, p_number, duty, requirement, crawl_time]
    print(position_name)
    mydb.execute(sql, data)


def getPage():
    """Walk the paginated listing (start=0..2920 step 10) and crawl each job link."""
    for i in range(0, 2920 + 1, 10):
        fullurl = base_url % i
        response = requests.get(fullurl, headers=headers)
        html = BeautifulSoup(response.text, 'lxml')
        # Skip the header row (first) and the pager row (last)
        tr_list = html.select('table.tablelist tr')[1:-1]
        for tr in tr_list:
            # Listing links are relative; prefix the site root
            detail_link = 'http://hr.tencent.com/' + tr.select('td > a')[0].get('href')
            # Request the detail page
            parse_detail(detail_link)


if __name__ == '__main__':
    mydb = Mydb()
    getPage()