# Source code for api_vulnerabilities.cve_and_cwe.mitre_cwe_scrapers

"""
Scrapers for data from the https://cwe.mitre.org/ website.

CVE-2019-4570
https://nvd.nist.gov/vuln/detail/CVE-2019-4570#vulnCurrentDescriptionTitle
Scrapers for data from the https://nvd.nist.gov website.
"""
from rest_framework.reverse import reverse
from typing import List, Dict
from bs4 import BeautifulSoup
import requests
import re


class CWETableTop25Scraper:
    """
    Scraper for the table of the top 25 software weaknesses published at
    https://cwe.mitre.org/top25/archive/2020/2020_cwe_top25.html.
    """
    top_25_url = "https://cwe.mitre.org/top25/archive/2020/2020_cwe_top25.html"
    cwe_mitre_url = "https://cwe.mitre.org"
    # Seconds passed to ``requests.get`` as ``timeout``; without it a stalled
    # server would block the caller indefinitely.
    request_timeout = 30

    def __init__(self, host_address):
        """
        :param host_address: base address of this application; it is
            prepended to the internal ``get_by_cwe`` route to build
            ``sarenka_url`` for every row.
        """
        self.host_address = host_address

    def get_top_25(self):
        """
        SCRAPER - download the Top 25 page and return its rows.

        :return: list of dicts, one per data row of the table with id
            ``Detail``, with the keys ``rank``, ``ID_CWE``, ``description``,
            ``score``, ``definition_url`` and ``sarenka_url``.
        :raises AttributeError: if the page contains no table with id
            ``Detail``.
        """
        result = []
        source = requests.get(self.top_25_url, timeout=self.request_timeout).text
        soup = BeautifulSoup(source, 'lxml')
        # Table holding the details of the top 25 CWE entries.
        detail_table = soup.find("table", {"id": "Detail"})

        # The first row is skipped because it carries no data.
        for row in detail_table.find_all("tr")[1:]:
            cells = row.find_all("td")
            link = row.find("a")
            rank_tag = row.b
            # Skip rows that lack the cells, link or <b> element read below
            # instead of failing on the whole table.
            if len(cells) < 4 or link is None or rank_tag is None:
                continue

            id_cwe = link.string
            result.append({
                # The first and last character of the <b> text are dropped.
                "rank": rank_tag.string[1:-1],
                "ID_CWE": id_cwe,
                "description": cells[2].string,
                "score": cells[3].string,
                "definition_url": self.cwe_mitre_url + link["href"],
                # Address of the matching view inside this application.
                "sarenka_url": self.host_address + reverse('get_by_cwe', kwargs={"id_cwe": id_cwe}),
            })

        return result


class CWEDataScraper:
    """
    Scraper for a single CWE definition page under
    https://cwe.mitre.org/data/definitions/.

    The page is downloaded and parsed once in the constructor; every
    ``get_*`` method reads from that parsed document (``self.soup``).
    """
    cwe_mitre_url = "https://cwe.mitre.org/data/definitions/"
    # Seconds passed to ``requests.get`` as ``timeout``; without it a stalled
    # server would block the caller indefinitely.
    request_timeout = 30

    def __init__(self, id_cwe: str, host_address=None):
        """
        :param id_cwe: weakness identifier, either bare (``"79"``) or
            prefixed (``"CWE-79"``).
        :param host_address: base address of this application, prepended to
            internal routes when building ``sarenka_url`` values.
        """
        self.host_address = host_address
        if "-" in id_cwe:
            id_cwe = id_cwe.split("-")[1]  # for the "CWE-79" form
        self.id_cwe = id_cwe
        self.cwe_url = self.generate_definition_url()

        source = requests.get(self.cwe_url, timeout=self.request_timeout).text
        self.soup = BeautifulSoup(source, 'lxml')

    def _section(self, suffix: str):
        """Return the ``div`` whose id is ``oc_<id_cwe><suffix>``, or ``None``."""
        return self.soup.find("div", {"id": "oc_" + self.id_cwe + suffix})

    def generate_definition_url(self) -> str:
        """Return the URL of the definition page for ``self.id_cwe``."""
        return self.cwe_mitre_url + self.id_cwe + ".html"

    def get_title(self):
        """
        Return the title of the weakness.

        The text of the first ``h2`` is used; everything up to and including
        the first ``":"`` is dropped, so colons inside the title are kept.

        :raises AttributeError: if the page has no ``h2`` element.
        """
        heading = self.soup.find("h2")
        # ``.text`` also works when the heading contains nested tags, where
        # ``.string`` would be ``None``.
        return heading.text.split(":", 1)[-1].strip()

    def get_description(self) -> str:
        """
        Return the ``.string`` of the ``_Description`` section.

        :raises AttributeError: if the page has no such section.
        """
        return self._section("_Description").string

    def get_likelihood(self) -> str:
        """Likelihood that exploits exist and that the weakness gets exploited."""
        section = self._section("_Likelihood_Of_Exploit")
        if section is None:
            return "No information about exploitation likehood"
        return section.string

    def get_technical_impact(self) -> List[str]:
        """
        Common consequences of exploiting the weakness.

        :return: impacts without duplicates, in order of first appearance, or
            ``["No information"]`` when the consequences table is missing or
            does not have the expected markup.
        """
        result = []
        try:
            table = self._section("_Common_Consequences").find("table")
            # The first row is skipped because it carries no data.
            for row in table.find_all("tr")[1:]:
                cell = row.find("p", {"class": "smaller"})
                # Only the first <i>...</i> match is used.
                impact = re.findall(r"<i>(.*?)</i>", str(cell))[0]
                result.extend(part.strip() for part in impact.split(";"))
        except (AttributeError, IndexError):
            return ["No information"]

        # dict.fromkeys removes duplicates while keeping a stable order.
        return list(dict.fromkeys(result))

    def get_caused_by(self):
        """
        Stage at which the weakness is introduced, e.g. during implementation.

        :return: dict with the keys ``field``, ``process`` and
            ``description``; every value is ``"No information"`` when the
            ``_Modes_Of_Introduction`` table is missing or too short.
        """
        try:
            table = self._section("_Modes_Of_Introduction").find("table")
            rows = table.find_all("tr")

            field = rows[1].text  # e.g. Architecture and Design
            last_cells = rows[-1].find_all("td")

            process = last_cells[0].text  # e.g. Implementation
            # Only the text after the last ":" is kept.
            description = last_cells[1].text.split(":")[-1].strip()
        except (AttributeError, IndexError):
            return {
                "field": "No information",
                "process": "No information",
                "description": "No information"
            }

        return {
            "field": field,
            "process": process,
            "description": description
        }

    def get_cve_examples(self) -> List[Dict]:
        """
        Example vulnerabilities in concrete software for this weakness.

        :return: list of dicts with the keys ``id_CVE``, ``description``,
            ``mitre_url`` and ``sarenka_url``. If anything fails while the
            table is read, a single placeholder entry is appended to whatever
            was collected so far.
        """
        result = []
        # A missing host address must not discard the scraped data.
        host = self.host_address or ""
        try:
            table = self._section("_Observed_Examples").find("table", {"class": "Detail"})

            # The first row is skipped because it carries no data.
            for row in table.find_all("tr")[1:]:
                # Reset for every row so a row without a link neither fails
                # nor inherits the URL of the previous row.
                id_cve = None
                mitre_url = None
                description = None
                sarenka_url = None

                indent = row.find("div", {"class": "indent"})
                if indent:
                    description = indent.text

                link = row.find("a")
                if link:
                    id_cve = link.text
                    mitre_url = link["href"]
                    sarenka_url = host + reverse('get_by_cve', kwargs={"code": id_cve})

                result.append({
                    "id_CVE": id_cve,
                    "description": description,
                    "mitre_url": mitre_url,  # https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2006-3568
                    "sarenka_url": sarenka_url,
                })
        # Kept broad on purpose: besides parsing errors, ``reverse`` may raise
        # framework-specific exceptions.
        except Exception:
            # The placeholder text (including its spelling) is runtime output
            # and is left unchanged.
            result.append({
                "id_CVE": "No inforamtion",
                "description": "No inforamtion",
                "mitre_url": "No inforamtion",
                "sarenka_url": "No inforamtion",
            })

        return result

    def get_data(self) -> Dict:
        """
        Return all data extracted from the page parsed in the constructor.
        """
        return {
            "ID_CWE": "CWE-" + self.id_cwe,
            "title": self.get_title(),
            "description": self.get_description(),
            "likehood": self.get_likelihood(),
            "technical_impact": self.get_technical_impact(),
            "caused_by": self.get_caused_by(),
            "cve_examples": self.get_cve_examples()
        }