'nomadCoder'에 해당되는 글 2건

크롤링하여 CSV에 저장하기

import os
import csv
import requests
from bs4 import BeautifulSoup 

# Clear the terminal by running the shell command `clear` before anything is printed.
os.system("clear")
# Base URL of the job site; startgame() requests this page to find the brand links.
alba_url = "http://www.alba.co.kr"


def alldata(datalist):
  """Flatten one listing table body into a list of five-column rows.

  ``datalist`` is searched for ``td`` cells by class value.  The n-th cell of
  each class is combined into the n-th row, ordered as
  ``[place, title, time, pay, registration date]``.  One row is produced per
  ``local first`` cell; if another column has fewer cells, the index lookup
  raises ``IndexError``.
  """
  def cells(css_class):
    # All <td> cells matched for this class value.
    return datalist.find_all("td", {"class": css_class})

  place_cells = cells("local first")
  title_cells = cells("title")
  time_cells = cells("data")
  pay_cells = cells("pay")
  date_cells = cells("regDate last")

  places = [cell.text for cell in place_cells]
  # The title text lives in the first <span> inside the title cell.
  titles = [cell.find("span").text for cell in title_cells]
  times = [cell.text for cell in time_cells]
  # Pay is split across several <span>s; glue their texts together.
  pays = ["".join(part.text for part in cell.find_all("span")) for cell in pay_cells]
  dates = [cell.text for cell in date_cells]

  return [
    # Non-breaking spaces in the place name become ordinary spaces.
    [place.replace('\xa0', ' '), titles[row], times[row], pays[row], dates[row]]
    for row, place in enumerate(places)
  ]


def workjob(work_url):
  """Download one listing page and return its job rows.

  Args:
    work_url: Full URL of a single listing page.

  Returns:
    The list of rows built by ``alldata``; an empty list when the table
    body is missing or its cells cannot be parsed (never ``None``, so
    callers can iterate the result unconditionally).

  Raises:
    AttributeError: when the ``goodsList goodsJob`` container or its table
      is absent.  This is deliberately left unguarded: ``insidejob`` catches
      the failure and retries the page with its alternative URL.
  """
  result = requests.get(work_url)
  soup = BeautifulSoup(result.text, "html.parser")
  datalist = (
    soup.find("div", {"class": "goodsList goodsJob"})
    .find("table", {"cellspacing": "0"})
    .find("tbody")
  )

  try:
    return alldata(datalist)
  except (AttributeError, IndexError):
    # Missing <tbody>, a title cell without <span>, or columns of unequal
    # length: skip this page instead of handing None to the caller.
    print(f"skipped unparsable page: {work_url}")
    return []

def insidejob(inside_url,company):
  """Crawl every listing page of one brand site and save the rows as CSV.

  The total number of listings is read from the ``NormalInfo`` block of the
  brand page, the pages are fetched 50 listings at a time through
  ``workjob``, and the collected pages are written once via ``save``.

  Args:
    inside_url: Base URL of the brand site; the page query is appended to it.
    company: Brand name, passed through to ``save`` as the CSV file name.
  """
  print(inside_url)
  result = requests.get(inside_url)
  soup = BeautifulSoup(result.text,"html.parser")
  listcount = int(soup.find("div",{"id":"NormalInfo"}).find("p").find("strong").text.replace(",",""))

  # Must stay in sync with the literal pagesize=50 in both URLs below.
  page_size = 50
  # Integer ceiling division: a partially filled last page still counts.
  pagecount = (listcount + page_size - 1) // page_size

  bigdataforcvs = []
  for i in range(1,pagecount+1):
    try:
      pageurl = f"{inside_url}job/brand/?page={i}&pagesize=50&areacd=&workaddr1=&workaddr2=&jobkind=&jobkindsub=&jobkindmulti=&gendercd=&agelimitcd=&agelimit=0&worktime=&weekdays=&searchterm=&paycd=&paystart=&payend=&workperiodcd=&workstartdt=&workenddt=&workchkyn=&workweekcd=&targetcd=&streetunicd=&streetstationcd=&unicd=&schnm=&schtext=&orderby=freeorder&acceptmethod=&eleccontract=&careercd=%20&lastschoolcd=&welfarecd=&careercdunrelated=&lastschoolcdunrelated=&strAreaMulti=&genderunrelated=&special=&hiretypecd=&totalCount={listcount}"
      page_rows = workjob(pageurl)
    except Exception:
      # The first URL form failed for this brand; retry with the alternative
      # query layout.  A failure here is allowed to propagate.
      pageurl = f"{inside_url}?page={i}&pagesize=50&areacd=&workaddr1=&workaddr2=&jobkind=&jobkindsub=&jobkindmulti=&gendercd=&agelimitcd=&agelimit=0&worktime=&weekdays=&searchterm=&paycd=&paystart=&payend=&workperiodcd=&workstartdt=&workenddt=&workchkyn=&workweekcd=&targetcd=&streetunicd=&streetstationcd=&unicd=&schnm=providercd&schtext=BP4,BP5,BP6,BP7&orderby=freeorder&acceptmethod=&eleccontract=&totalCount={listcount}&viewtype="
      page_rows = workjob(pageurl)
    bigdataforcvs.append(page_rows)
    print(f"크롤링 중 Page({i}/{pagecount})")

  # Write once after all pages are collected, rather than rewriting the whole
  # file after every page; this also produces a header-only file when the
  # brand has no listings.
  save(bigdataforcvs,company)


    
def save(bigdataforcvs,company):
  """Write the crawled rows to ``day8/csv/<company>.csv``.

  Args:
    bigdataforcvs: List of pages, each page being a list of rows (lists of
      cell strings).  A page entry that is ``None`` or empty is skipped.
    company: Used verbatim as the file name (without extension).

  The output directory is created when missing and any existing file is
  overwritten.  The first row is a fixed header.
  """
  out_dir = "day8/csv"
  os.makedirs(out_dir, exist_ok=True)
  # newline="" is what the csv module expects (avoids blank lines on Windows);
  # an explicit encoding keeps non-ASCII text independent of the locale.
  with open(f"{out_dir}/{company}.csv", mode="w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    # NOTE(review): the last header reads "data" although the column holds
    # the registration date; left unchanged in case consumers depend on it.
    writer.writerow(["place","title","time","pay","data"])
    for pages in bigdataforcvs:
      # A page may be None or empty when it could not be parsed.
      for jobs in pages or ():
        writer.writerow(jobs)

def startgame():
  """Crawl every brand linked from the site's main page.

  Requests ``alba_url``, collects the ``goodsBox-info`` links inside the
  ``MainSuperBrand`` block, and for each link prints the company name, runs
  ``insidejob`` on the link target, and prints a completion message.
  """
  main_page = requests.get(alba_url)
  soup = BeautifulSoup(main_page.text, "html.parser")

  # Walk down to the brand links one step at a time.
  brand_section = soup.find('div', {"id": "MainSuperBrand"})
  brand_box = brand_section.find("ul", {"class": "goodsBox"})
  brand_links = brand_box.find_all("a", {"class": "goodsBox-info"})

  for link in brand_links:
    company = link.find("span", {"class": "company"}).text
    print(company)
    insidejob(link["href"], company)
    print("csv 저장완료")

생각보다 코드가 길지만 숙련도가 늘면 깔끔하게 정리되지 않을까?

이런식으로 프린트를 해주어 얼마나 다운 받고 찾았나 알 수 있다

파일들은 회사 이름을 파일 이름으로 해서 저장했다

블로그 이미지

Or71nH

,

쉬는날~~ 
그래서 동적 크롤링을 구현해 봤다


자바 스크립트 inputtext 안에 있는 결과값 가져오기~!!!!



import requests
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup

# Read the converted amount from the currency-converter page with a real
# browser; the numbered prints are progress markers for each step.
# NOTE(review): these two flags are presumably what Chrome needs to start
# inside the hosted (repl) environment - confirm for other setups.
chrome_options = Options()
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
print(1)

driver = webdriver.Chrome(options=chrome_options)
try:
  print(2)
  driver.get("https://transferwise.com/gb/currency-converter/krw-to-usd-rate?amount=50")
  print(3)
  # find_element(By.XPATH, ...) replaces find_element_by_xpath, which newer
  # Selenium releases no longer provide.
  tag = driver.find_element(By.XPATH, "//input[@id='cc-amount-to']")
  print(tag)
  # The result is read from the input's `value` attribute of the live page.
  print (tag.get_attribute('value'))
  print(4)
finally:
  # Always shut the browser down, even when a step above fails.
  driver.quit()

 

 

 

 

 

 

이렇게 있는 데
여기서 저 구동된 결과값이 "value  data-hj-whitelist"
움 안보임.. 
정적 크롤링으로는 불가능: JS 구동 후에야 값이 value에 들어감
그래서 

동적 코딩 시작

설치부터 
https://webnautes.tistory.com/1184

그런데 문제가 생겼다!!

엉?~??
sudo가 안 먹힌다 T^T 

정말 열심히 찾다가 
https://go-madhat.github.io/chrome-headless/

from selenium import webdriver

# Minimal check that headless Chrome can start and load a page.
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument("--headless")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")

# `options=` replaces the deprecated `chrome_options=` keyword.
# NOTE(review): recent Selenium releases expect the driver path to be passed
# through a Service object instead of positionally - confirm against the
# installed version.
driver = webdriver.Chrome("./chromedriver", options=chrome_options)
try:
  driver.get('http://google.com')
finally:
  # Quit even if the page load fails, so no browser process is left behind.
  driver.quit()


repl에 있는 chrome 드라이버를 자동 할당하여 사용하는 코드를 찾았다!! 
이렇게 저렇게 하다보니 

import requests
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup

# Start Chrome and load the currency-converter page; the numbered prints
# mark how far the script got.
chrome_options = Options()
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
print(1)

driver = webdriver.Chrome(options=chrome_options)
try:
  print(2)
  driver.get("https://transferwise.com/gb/currency-converter/krw-to-usd-rate?amount=50")
  print(3)
finally:
  # Release the browser process whether or not the page load succeeded.
  driver.quit()

오오ㅗㅇ~!!!! 

되기 시작!!~!~!
드디어 repl안에서~!! 동적 크롤링 성공~!~!!~
오오옹오~!!~

문제는 이것이었다
아!!!!!! 이거 벨류 어케 가져와?~!??~!?~!

난 별의별짓을 다하면서

computer : 놉~!

computer : 놉~!

computer : 옛다~! 
나 : ??????????????

computer : 놉~!

으으ㅏ아아아앙~!!~!~
그러던중

엇!@@!!@@!@!

 

오ㅗ오오오~!~!!~~!!~!ㅇ오오오오~!~!!~!~
됐어~!!~~!
이렇게 하게 되었다~!

블로그 이미지

Or71nH

,