pip install beautifulsoup4
pip install html5lib
pip install requests
위의 세 패키지를 추가로 설치한 후에 아래의 스크립트를 실행하면 데이터 관련 책들의 출간 추이를 볼 수 있습니다.
from bs4 import BeautifulSoup
import requests
from time import sleep
from collections import Counter
import re
def is_video(td):
    """Return True if a listing cell describes a video rather than a book.

    Args:
        td: A BeautifulSoup ``<td>`` Tag; it is called as
            ``td('span', 'pricelabel')`` to collect its price-label spans.

    Returns:
        True when the cell holds exactly one price-label span whose stripped
        text begins with "video" (compared case-insensitively), else False.
    """
    pricelabels = td('span', 'pricelabel')
    if len(pricelabels) != 1:
        return False
    # Compare case-insensitively so a capitalized "Video" label is detected
    # just like the lowercase spelling.
    return pricelabels[0].text.strip().lower().startswith("video")
def book_info(td):
    """Extract a book's details from its listing cell.

    Args:
        td: A BeautifulSoup ``<td>`` Tag representing a book.

    Returns:
        A dict with the keys:
            "title":   text of the link inside ``div.thumbheader``.
            "authors": list of names from ``div.AuthorName``, with a leading
                       "By " removed, split on commas and stripped.
            "isbn":    the part of the link's href between "/product/" and
                       ".do".
            "date":    stripped text of ``span.directorydate``.

    Raises:
        AttributeError: If an expected element is missing from the cell or
            the href does not have the "/product/<id>.do" form.
    """
    # The title link carries both the title text and the product URL, so
    # look it up once.
    title_link = td.find("div", "thumbheader").a
    title = title_link.text

    by_author = td.find('div', 'AuthorName').text
    authors = [x.strip() for x in re.sub(r"^By ", "", by_author).split(",")]

    # Raw string: "\." must reach the regex engine as an escaped dot rather
    # than being treated as an (invalid) string escape.
    isbn_link = title_link.get("href")
    isbn = re.match(r"/product/(.*)\.do", isbn_link).group(1)

    date = td.find("span", "directorydate").text.strip()

    return {
        "title": title,
        "authors": authors,
        "isbn": isbn,
        "date": date,
    }
# Listing of "data" books sorted by publication date; the 1-based page
# number is appended to this URL.
base_url = ("http://shop.oreilly.com/category/browse-subjects/"
            "data.do?sortby=publicationDate&page=")

books = []          # one dict per book, as produced by book_info()
NUM_PAGES = 31      # number of listing pages to fetch

# Pause between page fetches so the crawl does not hammer the server.
# NOTE(review): `sleep` was imported but never called, which suggests a
# delay was intended; 30 seconds is an assumed value -- check the site's
# robots.txt for the required crawl delay.
CRAWL_DELAY_SECONDS = 30
# Without a timeout, requests.get() can block forever on a stalled server.
REQUEST_TIMEOUT_SECONDS = 30

for page_num in range(1, NUM_PAGES + 1):
    print("souping page", page_num, ",", len(books), " found so far")
    url = base_url + str(page_num)
    response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
    soup = BeautifulSoup(response.text, "html5lib")

    # Each book or video sits in a <td class="thumbtext">; keep books only.
    for td in soup("td", "thumbtext"):
        if not is_video(td):
            books.append(book_info(td))

    # No need to wait after the final page.
    if page_num < NUM_PAGES:
        sleep(CRAWL_DELAY_SECONDS)
import matplotlib.pyplot as plt
def get_year(book):
    """Return the publication year of a book as an int.

    Args:
        book: A dict whose "date" value is a whitespace-separated string
            ending in the year, e.g. "November 2014".

    Returns:
        The last whitespace-separated token of ``book["date"]`` as an int.

    Raises:
        ValueError: If the last token is not an integer.
        IndexError: If the date string is empty or only whitespace.
    """
    # Take the last token rather than the second one so that a year-only
    # date ("2014") or a date with a day ("November 3, 2014") also works;
    # the result is unchanged for the usual "Month YYYY" form.
    return int(book["date"].split()[-1])
# Count books per publication year, ignoring anything after 2014.
# map() parses each book's year once instead of twice per book.
year_counts = Counter(year for year in map(get_year, books) if year <= 2014)

years = sorted(year_counts)
book_counts = [year_counts[year] for year in years]

# Center each bar on its year. The former "x - 0.5" offset compensated for
# edge-aligned bars; with center alignment it would shift every bar half a
# unit to the left of its tick.
plt.bar(years, book_counts, align="center")
plt.xlabel("year")
plt.ylabel("# of data books")
plt.title("Data is Big!")
plt.show()
댓글 없음:
댓글 쓰기
참고: 블로그의 회원만 댓글을 작성할 수 있습니다.