|
@@ -10,6 +10,7 @@ import unicodedata
|
|
|
from contextlib import contextmanager
|
|
|
from urllib.parse import unquote
|
|
|
|
|
|
+import cloudscraper
|
|
|
import requests
|
|
|
from bs4 import BeautifulSoup, CData, Comment, NavigableString
|
|
|
from newspaper import Article
|
|
@@ -46,29 +47,34 @@ def get_url(url: str, user_agent: str = None) -> str:
|
|
|
supported_content_types = extract_processor.SUPPORT_URL_CONTENT_TYPES + ["text/html"]
|
|
|
response = requests.head(url, headers=headers, allow_redirects=True, timeout=(5, 10))
|
|
|
|
|
|
- if response.status_code != 200:
|
|
|
- return "URL returned status code {}.".format(response.status_code)
|
|
|
+ if response.status_code == 200:
|
|
|
+ # check content-type
|
|
|
+ content_type = response.headers.get('Content-Type')
|
|
|
+ if content_type:
|
|
|
+ main_content_type = response.headers.get('Content-Type').split(';')[0].strip()
|
|
|
+ else:
|
|
|
+ content_disposition = response.headers.get('Content-Disposition', '')
|
|
|
+ filename_match = re.search(r'filename="([^"]+)"', content_disposition)
|
|
|
+ if filename_match:
|
|
|
+ filename = unquote(filename_match.group(1))
|
|
|
+ extension = re.search(r'\.(\w+)$', filename)
|
|
|
+ if extension:
|
|
|
+ main_content_type = mimetypes.guess_type(filename)[0]
|
|
|
|
|
|
- # check content-type
|
|
|
- content_type = response.headers.get('Content-Type')
|
|
|
- if content_type:
|
|
|
- main_content_type = response.headers.get('Content-Type').split(';')[0].strip()
|
|
|
- else:
|
|
|
- content_disposition = response.headers.get('Content-Disposition', '')
|
|
|
- filename_match = re.search(r'filename="([^"]+)"', content_disposition)
|
|
|
- if filename_match:
|
|
|
- filename = unquote(filename_match.group(1))
|
|
|
- extension = re.search(r'\.(\w+)$', filename)
|
|
|
- if extension:
|
|
|
- main_content_type = mimetypes.guess_type(filename)[0]
|
|
|
+ if main_content_type not in supported_content_types:
|
|
|
+ return "Unsupported content-type [{}] of URL.".format(main_content_type)
|
|
|
|
|
|
- if main_content_type not in supported_content_types:
|
|
|
- return "Unsupported content-type [{}] of URL.".format(main_content_type)
|
|
|
+ if main_content_type in extract_processor.SUPPORT_URL_CONTENT_TYPES:
|
|
|
+ return ExtractProcessor.load_from_url(url, return_text=True)
|
|
|
|
|
|
- if main_content_type in extract_processor.SUPPORT_URL_CONTENT_TYPES:
|
|
|
- return ExtractProcessor.load_from_url(url, return_text=True)
|
|
|
+ response = requests.get(url, headers=headers, allow_redirects=True, timeout=(120, 300))
|
|
|
+ elif response.status_code == 403:
|
|
|
+ scraper = cloudscraper.create_scraper()
|
|
|
+ response = scraper.get(url, headers=headers, allow_redirects=True, timeout=(120, 300))
|
|
|
+
|
|
|
+ if response.status_code != 200:
|
|
|
+ return "URL returned status code {}.".format(response.status_code)
|
|
|
|
|
|
- response = requests.get(url, headers=headers, allow_redirects=True, timeout=(120, 300))
|
|
|
a = extract_using_readabilipy(response.text)
|
|
|
|
|
|
if not a['plain_text'] or not a['plain_text'].strip():
|