# Jump to content
#
# User:IngenuityBot/getpagemetadata.py
#
# From Wikipedia, the free encyclopedia
import requests, re
from html.parser import HTMLParser

# Maps a news site's registrable domain to the wikitext used for the
# citation's "website" field. Keys are matched against the URL being cited.
urls = {
    "cbc.ca": "[[CBC News]]",
    "ctvnews.ca": "[[CTV News]]",
    "globalnews.ca": "[[Global News]]",
    "thestar.com": "[[Toronto Star]]",
    "washingtonpost.com": "[[The Washington Post]]",
    "nytimes.com": "[[The New York Times]]",
    "theglobeandmail.com": "[[The Globe and Mail]]",
    "nationalpost.com": "[[National Post]]",
    "apnews.com": "[[Associated Press]]",
    "reuters.com": "[[Reuters]]",
    "bbc.com": "[[BBC News]]",
    "theguardian.com": "[[The Guardian]]",
    "aljazeera.com": "[[Al Jazeera]]",
    "npr.org": "[[NPR]]",
    "nbcnews.com": "[[NBC News]]",
    "usatoday.com": "[[USA Today]]",
    "latimes.com": "[[Los Angeles Times]]",
    "wsj.com": "[[The Wall Street Journal]]",
    "politico.com": "[[Politico]]",
    "bloomberg.com": "[[Bloomberg News]]",
    "axios.com": "[[Axios (website)|Axios]]",
    "businessinsider.com": "[[Business Insider]]",
    "thehill.com": "[[The Hill (newspaper)|The Hill]]",
    "nypost.com": "[[New York Post]]",
    "chicagotribune.com": "[[Chicago Tribune]]",
    "vox.com": "[[Vox (website)|Vox]]",
    "slate.com": "[[Slate (magazine)|Slate]]",
    "theatlantic.com": "[[The Atlantic]]",
    "newyorker.com": "[[The New Yorker]]",
    "time.com": "[[Time (magazine)|Time]]",
    "smh.com.au": "[[The Sydney Morning Herald]]",
    "space.com": "[[Space.com]]",
    "rollingstone.com": "[[Rolling Stone]]",
    "nzherald.co.nz": "[[The New Zealand Herald]]",
    "news.com.au": "[[News.com.au]]",
    "nasa.gov": "[[NASA]]",
    "msnbc.com": "[[MSNBC]]",
    "thejc.com": "[[The Jewish Chronicle]]",
    "theconversation.com": "[[The Conversation (website)|The Conversation]]",
    "hollywoodreporter.com": "[[The Hollywood Reporter]]",
    "gizmodo.com": "[[Gizmodo]]",
    "thediplomat.com": "[[The Diplomat]]",
    "deadline.com": "[[Deadline Hollywood]]",
    "abcnews.go.com": "[[ABC News]]",
    "cnn.com": "[[CNN]]",
    "theverge.com": "[[The Verge]]",
    "theage.com.au": "[[The Age]]",
    # Was misspelled "arstechica.com", which never matched real article URLs.
    "arstechnica.com": "[[Ars Technica]]",
    "avclub.com": "[[The A.V. Club]]",
    "buzzfeednews.com": "[[BuzzFeed News]]",
    "csmonitor.com": "[[The Christian Science Monitor]]",
    "cnet.com": "[[CNET]]",
    "telegraph.co.uk": "[[The Daily Telegraph]]",
    "ew.com": "[[Entertainment Weekly]]",
    "forbes.com": "[[Forbes]]",
    "ign.com": "[[IGN]]",
    "qz.com": "[[Quartz (publication)|Quartz]]",
    "scientificamerican.com": "[[Scientific American]]",
    "scmp.com": "[[South China Morning Post]]",
    "variety.com": "[[Variety (magazine)|Variety]]",
    "vogue.com": "[[Vogue (magazine)|Vogue]]",
    "wired.com": "[[Wired (magazine)|Wired]]",
}


class Parser(HTMLParser):
    def __init__(self) -> None:
        super().__init__()
        self.elements = []
        self.metadata = {}

    def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
        self.elements.append((tag, attrs))
        if tag == "meta":
            dataname, content = "", ""
            for attr in attrs:
                if attr[0] == "name" or attr[0] == "property":
                    dataname = attr[1]
                elif attr[0] == "content":
                    content = attr[1]
                
            if dataname and content:
                self.metadata[dataname] = content


def getarchiveurl(url):
    """Look up the closest archived snapshot of *url* on archive.org.

    Queries the Wayback Machine availability endpoint and returns a
    ``(snapshot_url, timestamp)`` tuple taken from the response's
    ``archived_snapshots.closest`` entry, or ``("", "")`` when the response
    reports no snapshot.

    Raises whatever ``requests`` raises on network failure, timeout, or a
    non-JSON response body.
    """
    # Pass the target through ``params`` so it is percent-encoded; plain
    # concatenation let any "&", "?" or "#" in it corrupt the query string.
    response = requests.get(
        "https://archive.org/wayback/available",
        params={"url": url},
        timeout=30,  # never hang indefinitely on an unresponsive server
    ).json()

    closest = response.get("archived_snapshots", {}).get("closest")
    if not closest:
        return ("", "")
    return closest["url"], closest["timestamp"]


def getmetadatabysite(url, site, metadata, elements):
    """Build the citation fields for a page from its parsed HTML.

    Args:
        url: Address of the page (currently unused; kept for callers).
        site: Wikitext name of the publisher, e.g. ``"[[CBC News]]"``.
        metadata: Mapping of <meta> name/property values to their content.
        elements: List of ``(tag, attrs)`` tuples for the page's start tags,
            where ``attrs`` is a list of ``(name, value)`` pairs.

    Returns:
        A new dict with ``title`` (from ``og:title``), ``date`` (from
        ``article:published_time``) and ``website`` keys. Missing values are
        empty strings. The input ``metadata`` is not modified.
    """
    # Build the result under its own name. Rebinding ``metadata`` hid the
    # page's <meta> values, so the old New York Times branch looked for
    # "article:published_time" in the *result* dict and always reset the date
    # to "". The default below already covers that site.
    result = {
        "title": metadata.get("og:title", ""),
        "date": metadata.get("article:published_time", ""),
        "website": site,
    }

    if site == "[[CBC News]]":
        # CBC pages take the date from <time datetime="...">; the last such
        # element on the page wins.
        for tag, attrs in elements:
            if tag != "time":
                continue
            for name, value in attrs:
                # Ignore valueless/empty attributes rather than storing None.
                if name == "datetime" and value:
                    result["date"] = value

    return result


def getpagemetadata(url):
    """Fetch *url* and assemble citation metadata for a recognised news site.

    Returns ``None`` when the URL's host is not one of the domains in
    ``urls`` (or the URL cannot be parsed). Otherwise returns a dict with
    ``title``, ``date``, ``website`` and ``url`` keys, plus ``archive-url``,
    ``archive-date`` (``YYYY-MM-DD``) and ``url-status`` when an archived
    snapshot is found.

    Raises whatever ``requests`` raises on network failure or timeout.
    """
    from urllib.parse import urlsplit

    # Match on the hostname only. The previous unescaped, unanchored regex
    # also matched domains appearing in the path or query string, treated
    # "." as a wildcard, and accepted longer hosts such as "bbc.community".
    try:
        parts = urlsplit(url)
        if not parts.netloc:
            # Scheme-less input ("www.cbc.ca/..."): reparse as a network path
            # so the host is still recognised.
            parts = urlsplit("//" + url)
        host = (parts.hostname or "").lower()
    except ValueError:
        return None

    website = ""
    for domain, name in urls.items():
        # Accept the bare domain and any subdomain of it.
        if host == domain or host.endswith("." + domain):
            website = name
            break
    if not website:
        return None

    # Browser-like User-Agent sent with the page request.
    headers = {
        "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/111.0"
    }
    content = requests.get(url, headers=headers, timeout=30).text

    parser = Parser()
    parser.feed(content)

    metadata = getmetadatabysite(url, website, parser.metadata, parser.elements)
    if not metadata:
        return None

    archive_url, archive_date = getarchiveurl(url)
    if archive_url:
        metadata["archive-url"] = archive_url
        # The first eight characters of the timestamp are sliced as YYYYMMDD
        # and reformatted as YYYY-MM-DD.
        metadata["archive-date"] = f"{archive_date[:4]}-{archive_date[4:6]}-{archive_date[6:8]}"
        metadata["url-status"] = "live"

    metadata["url"] = url

    return metadata