Crawling and saving to CSV
import os
import csv
import requests
from bs4 import BeautifulSoup

os.system("clear")  # clear the terminal (macOS/Linux)

alba_url = "http://www.alba.co.kr"
def alldata(datalist):
    # Pull out each column of the job table by its <td> class
    spacelist = datalist.find_all("td", {"class": "local first"})
    titlelist = datalist.find_all("td", {"class": "title"})
    timedatalist = datalist.find_all("td", {"class": "data"})
    paylist = datalist.find_all("td", {"class": "pay"})
    regdatelist = datalist.find_all("td", {"class": "regDate last"})
    bigdatalist = []
    spacedatalist = []
    titledatalist = []
    timedatadatalist = []
    paydatalist = []
    regdatedatalist = []
    for space in spacelist:
        spacedatalist.append(space.text)
    for title in titlelist:
        titledatalist.append(title.find("span").text)
    for timedata in timedatalist:
        timedatadatalist.append(timedata.text)
    for pay in paylist:
        # The pay cell is split across several <span>s; join them into one string
        moneylist = pay.find_all("span")
        moneyback = ""
        for money in moneylist:
            moneyback += money.text
        paydatalist.append(moneyback)
    for regdate in regdatelist:
        regdatedatalist.append(regdate.text)
    # Zip the five columns back together, one row per job posting
    for gogo in range(len(spacelist)):
        smalldata = []
        smalldata.append(spacedatalist[gogo].replace('\xa0', ' '))  # non-breaking space -> plain space
        smalldata.append(titledatalist[gogo])
        smalldata.append(timedatadatalist[gogo])
        smalldata.append(paydatalist[gogo])
        smalldata.append(regdatedatalist[gogo])
        bigdatalist.append(smalldata)
    return bigdatalist
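To sanity-check alldata() without hitting the site, you can feed it a tiny hand-written table. The markup below is invented for illustration and only mimics the td classes the real page uses:

sample_html = """
<tbody><tr>
  <td class="local first">Seoul&nbsp;Gangnam</td>
  <td class="title"><span>Cafe part-timer</span></td>
  <td class="data">09:00~18:00</td>
  <td class="pay"><span>Hourly</span><span>10,000</span></td>
  <td class="regDate last">08-30</td>
</tr></tbody>
"""
print(alldata(BeautifulSoup(sample_html, "html.parser")))
# [['Seoul Gangnam', 'Cafe part-timer', '09:00~18:00', 'Hourly10,000', '08-30']]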
def workjob(work_url):
    result = requests.get(work_url)
    soup = BeautifulSoup(result.text, "html.parser")
    # This chain raises AttributeError when the page uses a different layout;
    # insidejob() catches that and retries with its fallback URL
    datalist = soup.find("div", {"class": "goodsList goodsJob"}).find("table", {"cellspacing": "0"}).find("tbody")
    try:
        return alldata(datalist)
    except AttributeError:
        # A row was missing an expected tag; return an empty page instead of None
        return []
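Note that requests.get() won't raise anything by itself on a 404 or a hanging server. A minimal sketch of the usual guards, in case you want the script to fail loudly (the 10-second timeout is an arbitrary choice):

def fetch(url):
    result = requests.get(url, timeout=10)  # raises requests.exceptions.Timeout if the site hangs
    result.raise_for_status()               # raises requests.exceptions.HTTPError on 4xx/5xx
    return BeautifulSoup(result.text, "html.parser")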
def insidejob(inside_url, company):
    print(inside_url)
    result = requests.get(inside_url)
    soup = BeautifulSoup(result.text, "html.parser")
    # Total number of listings, e.g. "1,234" -> 1234
    listcount = int(soup.find("div", {"id": "NormalInfo"}).find("p").find("strong").text.replace(",", ""))
    bigdataforcvs = []
    # 50 listings per page, so round the page count up
    if listcount % 50 == 0:
        pagecount = int(listcount / 50)
    else:
        pagecount = 1 + int(listcount / 50)
    for i in range(1, pagecount + 1):
        try:
            pageurl = f"{inside_url}job/brand/?page={i}&pagesize=50&areacd=&workaddr1=&workaddr2=&jobkind=&jobkindsub=&jobkindmulti=&gendercd=&agelimitcd=&agelimit=0&worktime=&weekdays=&searchterm=&paycd=&paystart=&payend=&workperiodcd=&workstartdt=&workenddt=&workchkyn=&workweekcd=&targetcd=&streetunicd=&streetstationcd=&unicd=&schnm=&schtext=&orderby=freeorder&acceptmethod=&eleccontract=&careercd=%20&lastschoolcd=&welfarecd=&careercdunrelated=&lastschoolcdunrelated=&strAreaMulti=&genderunrelated=&special=&hiretypecd=&totalCount={listcount}"
            bigdataforcvs.append(workjob(pageurl))
            print(f"Crawling page ({i}/{pagecount})")
        except AttributeError:
            # Brand-style URL didn't match this company's page; try the flat layout
            pageurl = f"{inside_url}?page={i}&pagesize=50&areacd=&workaddr1=&workaddr2=&jobkind=&jobkindsub=&jobkindmulti=&gendercd=&agelimitcd=&agelimit=0&worktime=&weekdays=&searchterm=&paycd=&paystart=&payend=&workperiodcd=&workstartdt=&workenddt=&workchkyn=&workweekcd=&targetcd=&streetunicd=&streetstationcd=&unicd=&schnm=providercd&schtext=BP4,BP5,BP6,BP7&orderby=freeorder&acceptmethod=&eleccontract=&totalCount={listcount}&viewtype="
            bigdataforcvs.append(workjob(pageurl))
            print(f"Crawling page ({i}/{pagecount})")
    save(bigdataforcvs, company)
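The if/else computing pagecount is just ceiling division (50 listings per page); for reference, the same result in one line:

import math

listcount = 120                        # example value
pagecount = math.ceil(listcount / 50)  # -> 3
# equivalently, without math: pagecount = (listcount + 49) // 50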
def save(bigdataforcvs, company):
    # "with" closes the file; newline="" keeps csv from writing blank lines on Windows
    with open(f"day8/csv/{company}.csv", mode="w", newline="", encoding="utf-8") as file:
        writer = csv.writer(file)
        writer.writerow(["place", "title", "time", "pay", "date"])
        for pages in bigdataforcvs:
            for jobs in pages:
                writer.writerow(jobs)
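To sanity-check a finished file, reading it back takes a few lines; "SomeCompany" below is a placeholder for an actual saved file:

with open("day8/csv/SomeCompany.csv", newline="", encoding="utf-8") as f:
    for row in list(csv.reader(f))[:3]:  # header plus first two job rows
        print(row)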
def startgame():
    result = requests.get(alba_url)
    soup = BeautifulSoup(result.text, "html.parser")
    # Each "super brand" box on the main page links to one company's listing page
    inside = soup.find("div", {"id": "MainSuperBrand"}).find("ul", {"class": "goodsBox"}).find_all("a", {"class": "goodsBox-info"})
    for joburl in inside:
        company = joburl.find("span", {"class": "company"}).text
        print(company)
        insidejob(joburl["href"], company)

startgame()
print("CSV files saved")
The code turned out longer than I expected, but with more practice it should clean up nicely; one possible compact rewrite of alldata() is sketched below.
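As a taste of that cleanup, here is a minimal sketch of the same row extraction using zip() and list comprehensions. It assumes the same td classes as above and produces the same output shape:

def alldata_compact(datalist):
    # Same extraction as alldata(), condensed with comprehensions and zip()
    places = [td.text.replace("\xa0", " ") for td in datalist.find_all("td", {"class": "local first"})]
    titles = [td.find("span").text for td in datalist.find_all("td", {"class": "title"})]
    times = [td.text for td in datalist.find_all("td", {"class": "data"})]
    pays = ["".join(s.text for s in td.find_all("span")) for td in datalist.find_all("td", {"class": "pay"})]
    dates = [td.text for td in datalist.find_all("td", {"class": "regDate last"})]
    return [list(row) for row in zip(places, titles, times, pays, dates)]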
Printing like this shows how many pages have been downloaded and scraped so far.
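If you'd rather have a live progress bar than one print per page, the third-party tqdm package (pip install tqdm) can wrap the loop directly; a toy sketch, with time.sleep standing in for the actual workjob() call:

from tqdm import tqdm
import time

for i in tqdm(range(1, 11), desc="Crawling pages"):
    time.sleep(0.1)  # stand-in for bigdataforcvs.append(workjob(pageurl))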
The files are saved under each company's name.
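One caveat with using the raw company name as a filename: a name containing a character like / or : would break open(). A small sketch of a sanitizer; the replaced character set is my assumption, not something the original code handles:

import re

def safe_filename(name):
    # Replace characters that are illegal or awkward in filenames (assumed set)
    return re.sub(r'[\\/:*?"<>|]', "_", name).strip()

# usage sketch: open(f"day8/csv/{safe_filename(company)}.csv", mode="w")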