Web Scraping with Python and BeautifulSoup

- Requests: a library well known to most Python developers, and a fundamental tool for fetching raw HTML.
- Loading web pages with requests: the requests module lets you send HTTP requests from Python; see the minimal sketch after this list.
- This is a step-by-step, hands-on tutorial explaining how to scrape websites for information. PROTIP: if an API is not available, scrape (extract/mine) the specific information by parsing the site's HTML, for example with the Scrapy spider framework, run inside a virtual environment; a spider sketch also follows this list.
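As a minimal sketch of the first two points, the following fetches a page with requests and parses it with BeautifulSoup (the URL example.com is an illustrative placeholder, not part of the original gist):

import requests
from bs4 import BeautifulSoup

# Fetch a page and print its <title>; example.com is a placeholder URL.
response = requests.get('http://example.com')
response.raise_for_status()  # raise an exception on HTTP error codes
soup = BeautifulSoup(response.text, 'html.parser')
print(soup.title.string)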
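For the Scrapy route mentioned in the PROTIP, a minimal spider might look like this (the spider name, start URL, and CSS selectors are illustrative assumptions; quotes.toscrape.com is a public scraping sandbox):

import scrapy

class QuotesSpider(scrapy.Spider):
    name = 'quotes'
    start_urls = ['http://quotes.toscrape.com/']

    def parse(self, response):
        # Yield one dict per quote block found on the page.
        for quote in response.css('div.quote'):
            yield {
                'text': quote.css('span.text::text').get(),
                'author': quote.css('small.author::text').get(),
            }

Running scrapy runspider spider.py -o quotes.json from inside the virtual environment writes the scraped items to a JSON file.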
scraper.py
#!/usr/bin/env python
import os
import sys
import dropbox
from bs4 import BeautifulSoup
from urllib.request import urlopen

errorStrings = []

def formulateErrorStrings(tin):
    # Error messages the TINXSYS site returns when a TIN lookup fails.
    errorStrings.append('Dealer Not Found for the entered TIN ' + tin)
    errorStrings.append('Sorry, State corresponding to TIN not activated in TINXSYS')

def get_TIN_value():
    tinValue = input('Enter TIN value: ')
    return tinValue

def form_TIN_url(val):
    url = ('http://www.tinxsys.com/TinxsysInternetWeb/dealerControllerServlet?' +
           'tinNumber=' + val + '&searchBy=TIN&backPage=searchByTin_Inter.jsp')
    return url

def checkValidUrl(url):
    # The lookup is valid unless one of the known error strings appears on the page.
    valid = True
    page = urlopen(url)
    soup = BeautifulSoup(page.read(), 'html.parser')
    for string in soup.stripped_strings:
        if string in errorStrings:
            valid = False
            break
    return valid

def authenticateToDropbox():
    # OAuth2 flow without a redirect URI (legacy Dropbox SDK v1 API).
    app_key = os.getenv('DROPBOX_KEY')
    app_secret = os.getenv('DROPBOX_SECRET')
    flow = dropbox.client.DropboxOAuth2FlowNoRedirect(app_key, app_secret)
    authUrl = flow.start()
    print('Authorize the application from: ' + authUrl)
    authCode = input('Enter authorization code: ').strip()
    accessToken, userId = flow.finish(authCode)
    client = dropbox.client.DropboxClient(accessToken)
    print('Linked account: ', client.account_info())
    return client

def uploadToDropbox():
    client = authenticateToDropbox()
    # Use the file's modification time as the destination folder name.
    tStamp = os.path.getmtime('scrape.txt')
    with open('scrape.txt', 'rb') as f:
        response = client.put_file(str(tStamp) + '/scrape.txt', f)
    print('Uploaded: ', response)

def main():
    tin = get_TIN_value()
    formulateErrorStrings(tin)
    if checkValidUrl(form_TIN_url(tin)):
        uploadToDropbox()
    else:
        print(tin + ' not found')
        sys.exit()

if __name__ == '__main__':
    main()
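To run the script, export the DROPBOX_KEY and DROPBOX_SECRET app credentials read via os.getenv above, make sure a scrape.txt file exists in the working directory, and invoke python scraper.py. Note that dropbox.client.DropboxClient is the legacy v1 Dropbox SDK API used by the original gist.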
imdb.py
from bs4 import BeautifulSoup
import requests
import re

# Download IMDB's Top 250 data
url = 'http://www.imdb.com/chart/top'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'lxml')

movies = soup.select('td.titleColumn')
links = [a.attrs.get('href') for a in soup.select('td.titleColumn a')]
crew = [a.attrs.get('title') for a in soup.select('td.titleColumn a')]
ratings = [b.attrs.get('data-value') for b in soup.select('td.posterColumn span[name=ir]')]
votes = [b.attrs.get('data-value') for b in soup.select('td.ratingColumn strong')]

imdb = []

# Store each item in a dictionary (data), then collect those into a list (imdb)
for index in range(0, len(movies)):
    # Separate each entry into 'place', 'title', and 'year'
    movie_string = movies[index].get_text()
    movie = ' '.join(movie_string.split()).replace('.', '')
    # The chart rank is index + 1; slice the rank off the front and the
    # ' (YYYY)' suffix (7 characters) off the end to get the title.
    movie_title = movie[len(str(index + 1)) + 1:-7]
    year = re.search(r'\((.*?)\)', movie_string).group(1)
    place = movie[:len(str(index + 1))]
    data = {'movie_title': movie_title,
            'year': year,
            'place': place,
            'star_cast': crew[index],
            'rating': ratings[index],
            'vote': votes[index],
            'link': links[index]}
    imdb.append(data)

for item in imdb:
    print(item['place'], '-', item['movie_title'], '(' + item['year'] + ') -', 'Starring:', item['star_cast'])
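Each output line has the form place - title (year) - Starring: cast; the first entry of the Top 250 would print something like 1 - The Shawshank Redemption (1994) - Starring: Frank Darabont (dir.), Tim Robbins, Morgan Freeman.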