Skip to content

Commit b3f476a

Browse files
authored
Merge pull request #603 from Yuanzjls/issue-411
add yelp image download
2 parents 013b90c + 4e31459 commit b3f476a

2 files changed

Lines changed: 71 additions & 13 deletions

File tree

importers/yelp.com/download_html.py

Lines changed: 16 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,9 @@
1+
#!/usr/bin/python
2+
# -*- coding: utf-8 -*-
3+
14
import os
25
import lxml.html as html
3-
import urllib2
6+
import urllib.request, urllib.error, urllib.parse
47
import re
58

69

@@ -26,9 +29,9 @@ def get_href_without_ad_redirect(href):
2629
href = href[index + len(start_token):]
2730
if '&' in href:
2831
index = href.index('&')
29-
return urllib2.unquote(href[:index])
32+
return urllib.parse.unquote(href[:index])
3033
else:
31-
return urllib2.unquote(href)
34+
return urllib.parse.unquote(href)
3235

3336
return href
3437

@@ -55,14 +58,14 @@ def download_location_detail_page(url):
5558
name = get_location_name_from_url(url)
5659
output_file_name = 'data/raw_html_'+ name +'.html'
5760
if not os.path.exists(output_file_name):
58-
print ('Downloading details for location: ' + name)
59-
response = urllib2.urlopen(url)
61+
print('Downloading details for location: ' + name)
62+
response = urllib.request.urlopen(url)
6063
html = response.read() # returns all the lines in a file.
61-
with open(output_file_name, 'w') as html_file:
64+
with open(output_file_name, 'wb') as html_file:
6265
html_file.write(html)
6366
html_file.close()
6467
else:
65-
print ('Already have details for location: ' + name)
68+
print('Already have details for location: ' + name)
6669

6770
return output_file_name
6871

@@ -97,8 +100,8 @@ def auto_page_download(number_of_pages, find_query, location):
97100

98101
for i in range(number_of_pages):
99102
page_results = i * 10
100-
search_yelp = (yelp_base_url + 'search?find_desc=' + urllib2.quote(find_query) + '&find_loc='
101-
+ urllib2.quote(location) )
103+
search_yelp = (yelp_base_url + 'search?find_desc=' + urllib.parse.quote(find_query) + '&find_loc='
104+
+ urllib.parse.quote(location))
102105
if page_results > 0:
103106
search_yelp = search_yelp + '&start=' + str(page_results)
104107

@@ -107,13 +110,13 @@ def auto_page_download(number_of_pages, find_query, location):
107110
filename = "data/page_" + filename_encode(location) + "_" + filename_encode(find_query) + "_{}.html".format(i)
108111

109112
if not os.path.exists(filename):
110-
page = urllib2.urlopen(search_yelp) # url specific to
113+
page = urllib.request.urlopen(search_yelp) # url specific to
111114
html = page.read() # returns all the lines in a file.
112-
print ('Generating HTML file: ' + str(filename))
113-
with open(filename, 'w') as html_file:
115+
print('Generating HTML file: ' + str(filename))
116+
with open(filename, 'wb') as html_file:
114117
html_file.write(html)
115118
else:
116-
with open(filename, 'r') as f:
119+
with open(filename, 'r', encoding='utf-8') as f:
117120
html = f.read()
118121
f.close()
119122

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
#!/usr/bin/python
2+
# -*- coding: utf-8 -*-
3+
# TODO: video download, folder name improvement, URL improvement
4+
import os
5+
import lxml.html as html
6+
import urllib.request, urllib.error, urllib.parse
7+
import re
8+
import time
9+
10+
# Site-relative path of the gallery page where the crawl starts; the script
# entry point appends it to url_yelp to build the first page URL.
url_img = 'biz_photos/burma-superstar-san-francisco-2?select_video=7sT6xHsZQAEP8H3pgwlhlg'
11+
# Base URL that the site-relative gallery paths (url_img and the "next" links
# returned by find_next_url) are appended to.
url_yelp = 'https://www.yelp.com/'
12+
def save_image(url, filename):
    """Download the resource at *url* and write its raw bytes to *filename*.

    Args:
        url: Absolute URL of the resource to fetch.
        filename: Path of the file to create or overwrite; its parent
            directory must already exist.

    Raises:
        urllib.error.URLError: If the resource cannot be fetched.
        OSError: If *filename* cannot be opened for writing.
    """
    # Use the response as a context manager so the connection is released
    # even if reading fails part-way through.
    with urllib.request.urlopen(url) as response:
        payload = response.read()
    with open(filename, 'wb') as image_file:
        image_file.write(payload)
17+
18+
def image_save_url(url):
    """Save the main photo of the photo page at *url* under ``data/<folder>/``.

    The page is fetched and parsed, the first ``img.photo-box-img`` element
    supplies the image URL, and the text of the first breadcrumb link
    supplies the folder name. The file name is the third component of the
    image URL's path plus the extension of that path.

    Args:
        url: Absolute URL of the photo page.

    Raises:
        IndexError: If the page has no ``img.photo-box-img`` element with a
            ``src``, no breadcrumb link, or the image path has fewer than
            two directory components.
        urllib.error.URLError: If the page or the image cannot be fetched.
    """
    # Close the response as soon as the body has been read.
    with urllib.request.urlopen(url) as page:
        root = html.fromstring(page.read())

    sources = [img.get('src') for img in root.cssselect('img.photo-box-img')]
    if not sources or not sources[0]:
        raise IndexError('no img.photo-box-img source found on ' + str(url))
    # Resolve relative or protocol-relative sources against the page URL;
    # an already-absolute source is returned unchanged.
    image_url = urllib.parse.urljoin(url, sources[0])

    crumbs = root.xpath("//ul[@class='breadcrumbs']/li/a/text()")
    if not crumbs:
        raise IndexError('no breadcrumb link found on ' + str(url))
    folder = crumbs[0]

    # Take both the name and the extension from the URL *path* so that a
    # query string or fragment can never leak into the file name.
    image_path = urllib.parse.urlparse(image_url).path
    filename = image_path.split('/')[2] + os.path.splitext(image_path)[1]

    target_dir = os.path.join('data', folder)
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(target_dir, exist_ok=True)
    save_image(image_url, os.path.join(target_dir, filename))
29+
30+
31+
def find_next_url(url):
    """Return the href of the "next" link on the gallery page at *url*.

    Args:
        url: Absolute URL of the gallery page to fetch.

    Returns:
        The ``href`` of the first anchor whose class contains
        ``js-media-nav_link--next``, or ``None`` when the page has no such
        anchor, so callers can stop with a plain truthiness test.

    Raises:
        urllib.error.URLError: If the page cannot be fetched.
    """
    # Close the response as soon as the body has been read.
    with urllib.request.urlopen(url) as page:
        root = html.fromstring(page.read())
    hrefs = root.xpath("//a[contains(@class, 'js-media-nav_link--next')]/@href")
    # Indexing an empty result would raise IndexError on the last page.
    return hrefs[0] if hrefs else None
37+
38+
# Upper bound on the number of photos saved in one run.
MAX_IMAGES = 10
# Pause between consecutive page visits, in seconds.
REQUEST_DELAY_SECONDS = 0.3


def is_video_url(gallery_url):
    """Return True if any query parameter name in *gallery_url* contains 'video'.

    A URL without a query string is simply not a video URL, so this never
    raises for a missing query.
    """
    query = urllib.parse.urlparse(gallery_url).query
    # keep_blank_values keeps parameters such as "select_video=" visible.
    params = urllib.parse.parse_qs(query, keep_blank_values=True)
    return any('video' in key for key in params)


def skip_videos(gallery_url):
    """Follow "next" links from *gallery_url* until a non-video page is reached.

    Returns the first site-relative URL that is not a video page, or the
    falsy value handed back by find_next_url when the gallery ends first.
    """
    while gallery_url and is_video_url(gallery_url):
        gallery_url = find_next_url(url_yelp + gallery_url)
    return gallery_url


if __name__ == '__main__':
    # Start at the configured gallery entry, skipping any leading videos.
    next_url = skip_videos(url_img)
    saved = 0
    while next_url and saved < MAX_IMAGES:
        page_url = url_yelp + next_url
        image_save_url(page_url)
        saved += 1
        # Skip videos after every step so a video page never reaches
        # image_save_url.
        next_url = skip_videos(find_next_url(page_url))
        time.sleep(REQUEST_DELAY_SECONDS)

0 commit comments

Comments
 (0)