Source code for api_vulnerabilities.cve_and_cwe.general_scraper

from typing import Dict, Optional, Union
import requests
from requests.exceptions import InvalidSchema, MissingSchema, ConnectionError
from urllib3.exceptions import NewConnectionError
from bs4 import BeautifulSoup
import traceback
import logging.config


# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)
# NOTE(review): this fires once at import time and looks like a leftover
# logging smoke test — confirm nothing relies on it before removing.
logger.debug("this is a debug message")


class GeneralScraperError(Exception):
    """Error raised when a page cannot be fetched for scraping.

    Attributes:
        errors: Optional extra detail supplied by the raiser; stored as given.
    """

    def __init__(self, message=None, errors=None):
        # Keep the caller-supplied detail on the instance, then let the base
        # ``Exception`` record the message in ``args``.
        self.errors = errors
        super().__init__(message)
class GeneralScraper:
    """Fetch a web page and extract general metadata from its HTML.

    The page is downloaded once in the constructor and parsed with
    BeautifulSoup's ``html.parser``. The ``get_*`` methods then read the
    title, description, keywords and Open Graph image from the parsed tree.
    """

    def __init__(self, url: str, timeout: Optional[float] = 10.0):
        """Download and parse the page at *url*.

        Args:
            url: Address of the page to scrape.
            timeout: Seconds to wait for the server before giving up.
                ``None`` disables the timeout and waits indefinitely.

        Raises:
            GeneralScraperError: If the URL is malformed, the connection
                fails, or the request fails for any other reason.
        """
        try:
            page = requests.get(url, timeout=timeout)
        except (InvalidSchema, MissingSchema) as err:
            logger.error(err)
            raise GeneralScraperError(f'Invalid url "{url}"') from err
        except (ConnectionError, NewConnectionError) as err:
            logger.error(err)
            raise GeneralScraperError(
                f'Connection error for url "{url}"'
            ) from err
        except Exception as err:
            # Any other failure must also abort construction: there is no
            # response to parse, so continuing would only fail later on the
            # unbound ``page`` with an unrelated error.
            logger.error("The error is %s", traceback.format_exc())
            raise GeneralScraperError(f'Failed to fetch url "{url}"') from err
        self.soup = BeautifulSoup(page.text, 'html.parser')

    def _find_in_head(self, *args, **kwargs):
        """Run ``find`` on the document's ``<head>``; ``None`` if there is no head."""
        head = self.soup.head
        if head is None:
            return None
        return head.find(*args, **kwargs)

    def get_all(self) -> Dict[str, Optional[Union[str, Dict[str, Optional[str]]]]]:
        """Return every scraped field in one dictionary.

        Returns:
            A dict with the keys ``title``, ``description``, ``keywords`` and
            ``image``; each value is ``None`` when the page lacks that field.
        """
        return {
            "title": self.get_title(),
            "description": self.get_description(),
            "keywords": self.get_keywords(),
            "image": self.get_image(),
        }

    def get_title(self) -> Optional[str]:
        """Return the text of the ``<title>`` tag in the head, or ``None``."""
        title = self._find_in_head('title')
        return title.text if title is not None else None

    def get_description(self) -> Optional[Dict[str, Optional[str]]]:
        """Return the page descriptions found in its meta tags.

        Returns:
            A dict with ``meta_description`` (from ``<meta name="description">``
            in the head) and ``og_description`` (from the ``og:description``
            meta property anywhere in the document), or ``None`` when neither
            carries a non-empty ``content`` value.
        """
        meta_tag = self._find_in_head('meta', attrs={'name': 'description'})
        meta_description = meta_tag.get('content') if meta_tag is not None else None

        og_tag = self.soup.find("meta", property="og:description")
        og_description = og_tag.get('content') if og_tag is not None else None

        if not (meta_description or og_description):
            return None
        return {
            "meta_description": meta_description,
            "og_description": og_description,
        }

    def get_keywords(self) -> Optional[str]:
        """Return the ``content`` of ``<meta name="keywords">`` in the head, or ``None``."""
        meta_keywords = self._find_in_head('meta', attrs={'name': 'keywords'})
        return meta_keywords.get('content') if meta_keywords is not None else None

    def get_image(self) -> Optional[str]:
        """Return the ``content`` of the ``og:image`` meta property, or ``None``."""
        og_image = self.soup.find("meta", property="og:image")
        return og_image.get('content') if og_image is not None else None