1+ #!/usr/bin/python
2+ # -*- coding: utf-8 -*-
3+
14import os
25import lxml .html as html
3- import urllib2
6+ import urllib . request , urllib . error , urllib . parse
47import re
58
69
@@ -26,9 +29,9 @@ def get_href_without_ad_redirect(href):
2629 href = href [index + len (start_token ):]
2730 if '&' in href :
2831 index = href .index ('&' )
29- return urllib2 .unquote (href [:index ])
32+ return urllib . parse .unquote (href [:index ])
3033 else :
31- return urllib2 .unquote (href )
34+ return urllib . parse .unquote (href )
3235
3336 return href
3437
@@ -55,14 +58,14 @@ def download_location_detail_page(url):
5558 name = get_location_name_from_url (url )
5659 output_file_name = 'data/raw_html_' + name + '.html'
5760 if not os .path .exists (output_file_name ):
58- print ('Downloading details for location: ' + name )
59- response = urllib2 .urlopen (url )
61+ print ('Downloading details for location: ' + name )
62+ response = urllib . request .urlopen (url )
6063 html = response .read () # returns all the lines in a file.
61- with open (output_file_name , 'w ' ) as html_file :
64+ with open (output_file_name , 'wb ' ) as html_file :
6265 html_file .write (html )
6366 html_file .close ()
6467 else :
65- print ('Already have details for location: ' + name )
68+ print ('Already have details for location: ' + name )
6669
6770 return output_file_name
6871
@@ -97,8 +100,8 @@ def auto_page_download(number_of_pages, find_query, location):
97100
98101 for i in range (number_of_pages ):
99102 page_results = i * 10
100- search_yelp = (yelp_base_url + 'search?find_desc=' + urllib2 .quote (find_query ) + '&find_loc='
101- + urllib2 . quote (location ) )
103+ search_yelp = (yelp_base_url + 'search?find_desc=' + urllib . parse .quote (find_query ) + '&find_loc='
104+ + urllib . parse . quote (location ))
102105 if page_results > 0 :
103106 search_yelp = search_yelp + '&start=' + str (page_results )
104107
@@ -107,13 +110,13 @@ def auto_page_download(number_of_pages, find_query, location):
107110 filename = "data/page_" + filename_encode (location ) + "_" + filename_encode (find_query ) + "_{}.html" .format (i )
108111
109112 if not os .path .exists (filename ):
110- page = urllib2 .urlopen (search_yelp ) # url specific to
113+ page = urllib . request .urlopen (search_yelp ) # url specific to
111114 html = page .read () # returns all the lines in a file.
112- print ('Generating HTML file: ' + str (filename ))
113- with open (filename , 'w ' ) as html_file :
115+ print ('Generating HTML file: ' + str (filename ))
116+ with open (filename , 'wb ' ) as html_file :
114117 html_file .write (html )
115118 else :
116- with open (filename , 'r' ) as f :
119+ with open (filename , 'r' , encoding = 'utf-8' ) as f :
117120 html = f .read ()
118121 f .close ()
119122
0 commit comments