Merge pull request #603 from Yuanzjls/issue-411

joshi1983 · web-flow · commit b3f476ae8d65 · 2018-07-29T16:30:36.000-04:00
add yelp image download
diff --git a/importers/yelp.com/download_html.py b/importers/yelp.com/download_html.py
@@ -1,6 +1,9 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+
 import os
 import lxml.html as html
-import urllib2
+import urllib.request, urllib.error, urllib.parse
 import re
 
 
@@ -26,9 +29,9 @@ def get_href_without_ad_redirect(href):
 		href = href[index + len(start_token):]
 		if '&' in href:
 			index = href.index('&')
-			return urllib2.unquote(href[:index])
+			return urllib.parse.unquote(href[:index])
 		else:
-			return urllib2.unquote(href)
+			return urllib.parse.unquote(href)
 
 	return href
 
@@ -55,14 +58,14 @@ def download_location_detail_page(url):
 	name = get_location_name_from_url(url)
 	output_file_name = 'data/raw_html_'+ name +'.html'
 	if not os.path.exists(output_file_name):
-		print ('Downloading details for location: ' + name)
-		response = urllib2.urlopen(url)
+		print('Downloading details for location: ' + name)
+		response = urllib.request.urlopen(url)
 		html = response.read() # returns all the lines in a file.
-		with open(output_file_name, 'w') as html_file:
+		with open(output_file_name, 'wb') as html_file:
 			html_file.write(html)
 			html_file.close()
 	else:
-		print ('Already have details for location: ' + name)
+		print('Already have details for location: ' + name)
 
 	return output_file_name
 
@@ -97,8 +100,8 @@ def auto_page_download(number_of_pages, find_query, location):
 
 	for i in range(number_of_pages):
 		page_results = i * 10
-		search_yelp = (yelp_base_url + 'search?find_desc=' + urllib2.quote(find_query) + '&find_loc='
-			+ urllib2.quote(location) )
+		search_yelp = (yelp_base_url + 'search?find_desc=' + urllib.parse.quote(find_query) + '&find_loc='
+			+ urllib.parse.quote(location))
 		if page_results > 0:
 			search_yelp = search_yelp + '&start=' + str(page_results)
 
@@ -107,13 +110,13 @@ def auto_page_download(number_of_pages, find_query, location):
 		filename = "data/page_" + filename_encode(location) + "_" +  filename_encode(find_query) + "_{}.html".format(i)
 
 		if not os.path.exists(filename):
-			page = urllib2.urlopen(search_yelp) # url specific to
+			page = urllib.request.urlopen(search_yelp) # url specific to
 			html = page.read() # returns all the lines in a file.
-			print ('Generating HTML file: ' + str(filename))
-			with open(filename, 'w') as html_file:
+			print('Generating HTML file: ' + str(filename))
+			with open(filename, 'wb') as html_file:
 				html_file.write(html)
 		else:
-			with open(filename, 'r') as f:
+			with open(filename, 'r', encoding='utf-8') as f:
 				html = f.read()
 				f.close()
 
diff --git a/importers/yelp.com/download_image.py b/importers/yelp.com/download_image.py
@@ -0,0 +1,97 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+"""Download the first few photos of a Yelp business photo gallery.
+
+Starting from ``url_img``, the script follows the gallery's "next" links,
+skips video pages and stores each photo under ``data/<folder>/``.
+"""
+# TODO: video download, folder name improvement, url improvement
+import os
+import lxml.html as html
+import urllib.request, urllib.error, urllib.parse
+import re
+import time
+
+url_img = 'biz_photos/burma-superstar-san-francisco-2?select_video=7sT6xHsZQAEP8H3pgwlhlg'
+url_yelp = 'https://www.yelp.com/'
+MAX_IMAGES = 10      # the original run saved one photo plus at most nine more
+REQUEST_DELAY = 0.3  # seconds to pause after each page request
+
+
+def _fetch_root(url):
+    """Download *url* and return its parsed lxml document root."""
+    # The context manager closes the HTTP response once the body is read.
+    with urllib.request.urlopen(url) as page:
+        return html.fromstring(page.read())
+
+
+def _is_video_url(url):
+    """Return True when any query parameter name of *url* contains 'video'."""
+    query = urllib.parse.urlparse(url).query
+    keys = urllib.parse.parse_qs(query, keep_blank_values=True)
+    return any('video' in key for key in keys)
+
+
+def _next_url_from_root(root):
+    """Return the href of the gallery's "next" link in *root*, or None."""
+    hrefs = root.xpath("//a[contains(@class, 'js-media-nav_link--next')]/@href")
+    return hrefs[0] if hrefs else None
+
+
+def _save_image_from_root(root, page_url):
+    """Save the photo shown on the parsed page *root* under data/<folder>/."""
+    # src may be relative or protocol-relative; resolve it against the page.
+    src = root.cssselect('img.photo-box-img')[0].xpath('@src')[0]
+    image_url = urllib.parse.urljoin(page_url, src)
+    # The text of the first breadcrumb link is used as the folder name.
+    folder = root.xpath("//ul[@class='breadcrumbs']/li/a/text()")[0].strip()
+    # File name = path component at index 2 (split on '/') plus the extension.
+    # Both are taken from the URL path so a query string cannot leak in.
+    image_path = urllib.parse.urlparse(image_url).path
+    filename = image_path.split('/')[2] + os.path.splitext(image_path)[1]
+    target_dir = os.path.join('data', folder)
+    os.makedirs(target_dir, exist_ok=True)
+    save_image(image_url, os.path.join(target_dir, filename))
+
+
+def save_image(url, filename):
+    """Download *url* and write the raw bytes to *filename*."""
+    with urllib.request.urlopen(url) as page:
+        data = page.read()
+    with open(filename, 'wb') as image_file:
+        image_file.write(data)
+
+
+def image_save_url(url):
+    """Fetch the photo page at *url* and save the photo it displays.
+
+    Raises IndexError when the page has no photo or no breadcrumb link.
+    """
+    _save_image_from_root(_fetch_root(url), url)
+
+
+def find_next_url(url):
+    """Return the "next" link href of the photo page at *url*.
+
+    Returns None when the page has no next link.
+    """
+    return _next_url_from_root(_fetch_root(url))
+
+
+def main():
+    """Walk the gallery from url_img and save up to MAX_IMAGES photos."""
+    next_url = url_img
+    saved = 0
+    while next_url and saved < MAX_IMAGES:
+        page_url = urllib.parse.urljoin(url_yelp, next_url)
+        # One fetch per page serves both the photo and the next link.
+        root = _fetch_root(page_url)
+        if not _is_video_url(next_url):
+            _save_image_from_root(root, page_url)
+            saved += 1
+        next_url = _next_url_from_root(root)
+        time.sleep(REQUEST_DELAY)
+
+
+if __name__ == '__main__':
+    main()