说明
一个url处理函数非常耗时,导致nginx在等待过程中超时。改成并发处理,问题解决。记录下代码片段。
代码
# 调用过程
# Caller: fan out one worker thread per discovered URL so this request
# handler returns quickly instead of blocking until nginx times out.
from urllib.parse import urlsplit

for obj in queryset:
    res1 = parse_domains(obj.ilink)
    if 'href_domains' in res1:
        threads = []
        for url in res1['href_domains']:
            # urlsplit().netloc is robust; the original url.split('/')[2]
            # raised IndexError on malformed or relative URLs.
            domain = urlsplit(url).netloc
            # Skip sites that already exist.
            # (assumes exclude_urls holds bare domains — TODO confirm)
            if domain in exclude_urls:
                continue
            t = threading.Thread(target=processing_url_func, args=(url,))
            threads.append(t)
        # Fire-and-forget: threads are deliberately NOT joined, so the
        # slow per-URL work overlaps with the rest of the request.
        for t in threads:
            t.start()
    obj.status = 0
    obj.save()
    # Throttle: pause 4 seconds between queryset items.
    time.sleep(4)
# 线程任务函数（注：这里用的是 threading 线程，并非子进程）
def processing_url_func(url):
    """
    Thread worker: fetch site metadata for *url* and persist it.

    Calls get_web_info(url); on success stores a new iLinks row with the
    extracted title and keywords. Does nothing when get_web_info()
    returns a falsy value (request failed or metadata missing).
    """
    res2 = get_web_info(url)
    if res2:
        # objects.create() already INSERTs the row; the original's extra
        # iLi.save() issued a redundant second database write.
        iLinks.objects.create(title=res2[0], note=res2[1], home=url)
# 获取url信息
def get_web_info(url):
    """
    Fetch *url* and extract its <title> and keywords meta tag.

    Returns [title, note] on success, or False on any request error or
    when either the title or the keywords content is missing/empty.
    """
    session = sessions()
    req_header = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'}
    print(url)
    try:
        # NOTE(review): verify=False disables TLS verification — presumably
        # deliberate for crawling arbitrary sites, but confirm.
        resp = session.get(url, headers=req_header, allow_redirects=True, verify=False)
    except (ReadTimeout, ConnectionError, RequestException) as e:
        # Fail fast instead of the original tag-flag dance: the three tag
        # values were only ever tested as tag > 0, and RequestException is
        # the base class of the other two, so one handler is equivalent.
        print(e)
        return False
    soup = BeautifulSoup(resp.text, 'html.parser')
    title = ''
    note = ''
    # soup.title.string is None when <title> has nested tags; guard it so
    # _filter_emoji() is never handed None.
    if soup.title and soup.title.string:
        title = _filter_emoji(soup.title.string)
    keywords = soup.find(attrs={"name": "keywords"})
    if keywords:
        note = _filter_emoji(keywords['content'])
    if title and note:
        return [title, note]
    return False
评论 (0)