Web Scraping with Python and BeautifulSoup

- Requests: a library well known to most Python developers, and a fundamental tool for fetching raw HTML.
- Loading web pages with requests: the requests module lets you send HTTP requests from Python; see the minimal sketch after this list.
- This is a step-by-step, hands-on tutorial explaining how to scrape websites for information. PROTIP: if an API is not available, scrape (extract/mine) the specific information by parsing the site's HTML, for example with the Scrapy spider framework, run inside a virtual environment; a spider sketch also follows this list.
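As a minimal sketch of the first two points, the following fetches a page with requests and parses it with BeautifulSoup (the URL example.com is an illustrative placeholder, not part of the original gist):

import requests
from bs4 import BeautifulSoup

# Fetch a page and print its <title>; example.com is a placeholder URL.
response = requests.get('http://example.com')
response.raise_for_status()  # raise an exception on HTTP error codes
soup = BeautifulSoup(response.text, 'html.parser')
print(soup.title.string)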
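For the Scrapy route mentioned in the PROTIP, a minimal spider might look like this (the spider name, start URL, and CSS selectors are illustrative assumptions; quotes.toscrape.com is a public scraping sandbox):

import scrapy

class QuotesSpider(scrapy.Spider):
    name = 'quotes'
    start_urls = ['http://quotes.toscrape.com/']

    def parse(self, response):
        # Yield one dict per quote block found on the page.
        for quote in response.css('div.quote'):
            yield {
                'text': quote.css('span.text::text').get(),
                'author': quote.css('small.author::text').get(),
            }

Running scrapy runspider spider.py -o quotes.json from inside the virtual environment writes the scraped items to a JSON file.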
scraper.py
#!/usr/bin/env python
import os
import sys
import dropbox
from bs4 import BeautifulSoup
from urllib.request import urlopen

errorStrings = []

def formulateErrorStrings(tin):
    # Error messages the TINXSYS site returns when a TIN lookup fails.
    errorStrings.append('Dealer Not Found for the entered TIN ' + tin)
    errorStrings.append('Sorry, State corresponding to TIN not activated in TINXSYS')

def get_TIN_value():
    tinValue = input('Enter TIN value: ')
    return tinValue

def form_TIN_url(val):
    url = ('http://www.tinxsys.com/TinxsysInternetWeb/dealerControllerServlet?' +
           'tinNumber=' + val + '&searchBy=TIN&backPage=searchByTin_Inter.jsp')
    return url

def checkValidUrl(url):
    # The lookup is valid unless one of the known error strings appears on the page.
    valid = True
    page = urlopen(url)
    soup = BeautifulSoup(page.read(), 'html.parser')
    for string in soup.stripped_strings:
        if string in errorStrings:
            valid = False
            break
    return valid

def authenticateToDropbox():
    # OAuth2 flow without a redirect URI (legacy Dropbox SDK v1 API).
    app_key = os.getenv('DROPBOX_KEY')
    app_secret = os.getenv('DROPBOX_SECRET')
    flow = dropbox.client.DropboxOAuth2FlowNoRedirect(app_key, app_secret)
    authUrl = flow.start()
    print('Authorize the application from: ' + authUrl)
    authCode = input('Enter authorization code: ').strip()
    accessToken, userId = flow.finish(authCode)
    client = dropbox.client.DropboxClient(accessToken)
    print('Linked account: ', client.account_info())
    return client

def uploadToDropbox():
    client = authenticateToDropbox()
    # Use the file's modification time as the destination folder name.
    tStamp = os.path.getmtime('scrape.txt')
    with open('scrape.txt', 'rb') as f:
        response = client.put_file(str(tStamp) + '/scrape.txt', f)
    print('Uploaded: ', response)

def main():
    tin = get_TIN_value()
    formulateErrorStrings(tin)
    if checkValidUrl(form_TIN_url(tin)):
        uploadToDropbox()
    else:
        print(tin + ' not found')
        sys.exit()

if __name__ == '__main__':
    main()
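To run the script, export the DROPBOX_KEY and DROPBOX_SECRET app credentials read via os.getenv above, make sure a scrape.txt file exists in the working directory, and invoke python scraper.py. Note that dropbox.client.DropboxClient is the legacy v1 Dropbox SDK API used by the original gist.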
imdb.py
from bs4 import BeautifulSoup
import requests
import re

# Download IMDB's Top 250 data
url = 'http://www.imdb.com/chart/top'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'lxml')

movies = soup.select('td.titleColumn')
links = [a.attrs.get('href') for a in soup.select('td.titleColumn a')]
crew = [a.attrs.get('title') for a in soup.select('td.titleColumn a')]
ratings = [b.attrs.get('data-value') for b in soup.select('td.posterColumn span[name=ir]')]
votes = [b.attrs.get('data-value') for b in soup.select('td.ratingColumn strong')]

imdb = []

# Store each item in a dictionary (data), then collect those into a list (imdb)
for index in range(0, len(movies)):
    # Separate each entry into 'place', 'title', and 'year'
    movie_string = movies[index].get_text()
    movie = ' '.join(movie_string.split()).replace('.', '')
    # The chart rank is index + 1; slice the rank off the front and the
    # ' (YYYY)' suffix (7 characters) off the end to get the title.
    movie_title = movie[len(str(index + 1)) + 1:-7]
    year = re.search(r'\((.*?)\)', movie_string).group(1)
    place = movie[:len(str(index + 1))]
    data = {'movie_title': movie_title,
            'year': year,
            'place': place,
            'star_cast': crew[index],
            'rating': ratings[index],
            'vote': votes[index],
            'link': links[index]}
    imdb.append(data)

for item in imdb:
    print(item['place'], '-', item['movie_title'], '(' + item['year'] + ') -', 'Starring:', item['star_cast'])
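Each output line has the form place - title (year) - Starring: cast; the first entry of the Top 250 would print something like 1 - The Shawshank Redemption (1994) - Starring: Frank Darabont (dir.), Tim Robbins, Morgan Freeman.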