tests-data

#!/usr/bin/env python3

# This file is part of Cockpit.
#
# Copyright (C) 2017 Red Hat, Inc.
#
# Cockpit is free software; you can redistribute it and/or modify it
# under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation; either version 2.1 of the License, or
# (at your option) any later version.
#
# Cockpit is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with Cockpit; If not, see <http://www.gnu.org/licenses/>.

import gzip
import json
import os
import re
import socket
import ssl
import subprocess
import sys
import tempfile
import time
import urllib.parse
import urllib.request
import urllib.error
import urllib.parse
import zlib

import html.parser

sys.dont_write_bytecode = True

import task

from machine import testvm
from machine.machine_core.directories import BOTS_DIR


# The number of days of previous closed pull requests to learn from
SINCE_DAYS = 120

# Things that are already covered by the seed data and need not be
# retrieved again: log URLs and their parent directories, revisions,
# and the "pull" values of pull requests that were completely seeded
SEEDED = set()

# Cache of the links found at each log sink, keyed by the sink URL
SINKS = {}


def run(filename, verbose=False, dry=False, **kwargs):
    """Collect test results for recent commits and pull requests.

    Every result is written as one JSON object per line, either into a
    gzipped data file or, when no filename is given, to standard output.

    filename: the gzipped data file to seed from and then replace. A bare
        name that does not exist locally is looked up in the images data
        directory, after downloading it unless dry is set.
    verbose: report progress on stderr
    dry: neither download nor upload the data file
    kwargs: any further options handed in by the caller; unused here
    """
    since = time.time() - 60 * 60 * 24 * SINCE_DAYS
    pulls = Pulls(since)

    outname = None
    if filename:
        if "/" not in filename and not os.path.exists(filename):
            if not dry:
                subprocess.check_call([os.path.join(BOTS_DIR, "image-download"), "--state", filename])
            filename = os.path.join(testvm.get_images_data_dir(), filename)
        # The new data is written to a temporary file next to the target,
        # which only replaces the target once everything was written
        (outfd, outname) = tempfile.mkstemp(prefix=os.path.basename(filename), dir=os.path.dirname(filename))
        os.close(outfd)

    output = None
    finished = False

    def write(**fields):
        output.write(json.dumps(fields).encode('utf-8') + b"\n")

    try:
        if outname:
            output = gzip.open(outname, 'wb')
            # Seed with our input data
            if os.path.exists(filename):
                with gzip.open(filename, 'rb') as fp:
                    seed(since, fp, pulls, output)
        else:
            output = sys.stdout.buffer

        # Iterate through all revisions, pull requests on this branch
        for (commit, merged, created, pull) in commits("master", pulls, since, verbose):
            logged = False
            if verbose:
                sys.stderr.write("- {0}\n".format(commit))
            for (context, log_created, url, log) in logs(commit):
                if verbose:
                    sys.stderr.write("  - {0} {1}\n".format(log_created, context))

                # Tracked per log: a log without any TAP results must be
                # recorded even when an earlier log of this commit had some
                found = False
                for (status, name, body, tracker) in tap(log):
                    write(pull=pull, revision=commit, status=status,
                          context=context, date=log_created, merged=merged,
                          test=name, url=url, tracker=tracker, log=body)
                    found = True

                # Nothing found for this log
                if not found:
                    write(pull=pull, revision=commit, status="unknown", date=log_created,
                          merged=merged, url=url, log=log)
                logged = True

            # Nothing found for this revision
            if not logged:
                write(pull=pull, revision=commit, status="unknown", date=created, merged=merged)

        finished = True
    finally:
        if outname:
            if output is not None:
                output.close()
            if finished:
                os.rename(outname, filename)
            else:
                # Do not leave a partial temporary file behind on failure
                os.unlink(outname)
        else:
            # Standard output is flushed but left open for the caller
            sys.stdout.flush()

    if not dry and outname and filename:
        upload = [os.path.join(BOTS_DIR, "image-upload"), "--state", filename]
        subprocess.check_call(upload)


class HrefParser(html.parser.HTMLParser):
    """Collect the targets of all <a href="..."> links fed to the parser.

    Every href is joined with the base URL, so that relative links come
    out qualified, and is appended to the hrefs list given by the caller.
    """

    def __init__(self, base, hrefs):
        super().__init__()
        self.base = base
        self.hrefs = hrefs

    def handle_starttag(self, tag, attrs):
        # Only anchors are of interest
        if tag.lower() != "a":
            return
        self.hrefs.extend(
            urllib.parse.urljoin(self.base, value)
            for (name, value) in attrs
            if name.lower() == "href"
        )


class Pulls():
    """A cache of pull requests that can tell whether one was merged.

    A pull request counts as included in its base branch when GitHub
    reports it as merged, or when a commit on that branch refers to it
    with a Closes/Fixes line.
    """

    def __init__(self, since):
        # Base branches that were already fetched from origin
        self.fetched = {}
        # Pull number -> result of the git log check done by merged()
        self.checked = {}
        # Known pull objects, keyed by both pull number and API url
        self.pulls = {}
        # The pull objects in listing order; empty until fully listed
        self.listing = []
        # Only pull requests since this time are listed
        self.since = since

    # Index a pull object so that normalize() can find it again
    def _remember(self, pull):
        self.pulls[pull["number"]] = pull
        url = pull.get("url")
        if url:
            self.pulls[url] = pull

    # Get all the pull requests since a given time
    def __iter__(self):
        if self.listing:
            # Replay a copy of the completed listing. Iterating self.pulls
            # here would fail, because the loop below adds keys to it.
            iterate = list(self.listing)
        else:
            iterate = task.api.pulls(state="all", since=self.since)
        listing = []
        for pull in iterate:
            self._remember(pull)
            listing.append(pull)
            yield pull
        self.listing = listing

    # Turn a string/int pull number or url into a pull object.
    # Raises ValueError for anything that is not an int, str or dict.
    def normalize(self, pull):
        if isinstance(pull, int):
            pull = str(pull)
        if isinstance(pull, str):
            if "/" not in pull:
                pull = qualify("pulls/{0}".format(pull))
            if pull in self.pulls:
                pull = self.pulls[pull]
            else:
                pull = task.api.get(pull)
                # NOTE(review): assumes task.api.get() may return nothing
                # for a pull request that cannot be found - confirm
                if pull:
                    self._remember(pull)
        elif not isinstance(pull, dict):
            raise ValueError("Invalid pull request: {0}".format(repr(pull)))
        return pull

    # Returns True/False whether the pull request was included in its
    # base branch, or None when that cannot be told (yet): the pull
    # request is unknown, still open, or its base could not be fetched
    def merged(self, pull):
        pull = self.normalize(pull)
        if not pull:
            return None

        number = pull["number"]

        if number in self.checked:
            return self.checked[number]

        if pull.get("state") != "closed":
            return None

        # GitHub is telling us this was merged
        if pull.get("merged"):
            return True

        # Fetch git data about this branch. The path is made absolute so
        # that the directory is never an empty string.
        cwd = os.path.dirname(os.path.abspath(__file__))
        base = pull["base"]["ref"]
        if base not in self.fetched:
            try:
                subprocess.check_call(["git", "fetch", "-q", "--", "origin", base], cwd=cwd)
            except subprocess.CalledProcessError:
                return None  # error already printed by process
            self.fetched[base] = base

        # Look for git commits up until a year before the pull request
        when = time.mktime(time.strptime(pull["created_at"], "%Y-%m-%dT%H:%M:%SZ"))
        when -= 60 * 60 * 24 * 365
        since = time.strftime("%Y-%m-%d", time.gmtime(when))

        # Check if it's referred to in this branch. The number has to
        # stand on its own so that pull 12 does not match "Fixes #123".
        match = "(Closes|Fixes|closes|fixes).*[^0-9]{0}([^0-9]|$)".format(number)
        cmd = [
            "git", "log", "--extended-regexp", "--grep", match,
            "--since=" + since, "origin/" + base
        ]
        output = subprocess.check_output(cmd, cwd=cwd)
        self.checked[number] = bool(output)
        return self.checked[number]


def retrieve(url):
    """Return the content of the given URL as text.

    The body is decoded as UTF-8, with undecodable bytes replaced rather
    than raising. Errors raised by urllib while opening or reading the
    URL are passed on to the caller.
    """
    # Certificates and host names are not verified here.
    # NOTE(review): presumably because the log sinks use certificates that
    # cannot be validated - confirm before retrieving anything sensitive
    ctx = ssl.create_default_context()
    ctx.check_hostname = False
    ctx.verify_mode = ssl.CERT_NONE

    # The response is closed again once it has been read
    with urllib.request.urlopen(url, context=ctx) as response:
        return response.read().decode('utf-8', 'replace')


def links(url):
    """Return a list of the qualified link targets found at the given URL.

    A page that does not exist (HTTP 404) results in an empty list, other
    HTTP errors are raised. Connection level failures are reported on
    stderr, and whatever was collected until then is returned.
    """
    found = []
    collector = HrefParser(url, found)
    try:
        page = retrieve(url)
        collector.feed(page)
    except urllib.error.HTTPError as ex:
        if ex.code != 404:
            raise
    except (ConnectionResetError, urllib.error.URLError, socket.gaierror) as ex:
        sys.stderr.write("{0}: {1}\n".format(url, ex))
    return found


def seed(since, fp, pulls, output):
    """Pass seed input data through to output and note what it covers.

    Each input line is one JSON record. While copying, the URLs, revisions
    and completed pull requests of the records are added to SEEDED so that
    they are not retrieved again.

    since: cutoff in seconds since the epoch. Records without a date or
        dated before it are not written to output, but still count as seen.
    fp: binary file object to read the records from
    pulls: the Pulls instance used to fill in a missing "merged" value
    output: binary file object the retained records are written to
    """
    # Function scope import, only needed to read UTC timestamps
    import calendar

    seeded = None
    known = re.compile("# SKIP Known issue #([0-9]+)", re.IGNORECASE)

    while True:
        try:
            line = fp.readline()
        except (OSError, EOFError, zlib.error) as ex:
            # EOFError is what gzip raises for a truncated file
            sys.stderr.write("tests-data: {0}\n".format(str(ex)))
            break
        if not line:
            break
        try:
            item = json.loads(line.decode('utf-8'))
        except ValueError as ex:
            sys.stderr.write("tests-data: {0}\n".format(str(ex)))
            continue
        if not isinstance(item, dict):
            continue

        # Once we see a new pull treat the old one as complete and seeded
        # As a failsafe, just to make sure we didn't miss something
        # we don't treat the last pull request as completely seeded
        pull = item.get("pull")
        if pull and pull != seeded:
            if seeded is not None:
                SEEDED.add(seeded)
            seeded = None

        if pull and item.get("merged") not in [True, False]:
            item["merged"] = pulls.merged(pull)

        # Note that we've already retrieved this URL
        url = item.get("url")
        if url and item.get("log") is not None:
            SEEDED.add(url)
            SEEDED.add(urllib.parse.urljoin(url, "./"))

        # If the pull request had a known merged value it can be seeded
        # This forces us to retrieve data about open pull requests again
        if item.get("merged") in [True, False]:
            seeded = pull
            revision = item.get("revision")
            if revision:
                SEEDED.add(revision)

        date = item.get("date")
        if not date:
            continue
        try:
            # The dates are in UTC, so they must not be read as local time
            when = calendar.timegm(time.strptime(date, "%Y-%m-%dT%H:%M:%SZ"))
        except ValueError as ex:
            sys.stderr.write("tests-data: {0}\n".format(str(ex)))
            continue
        if since > when:
            continue

        # COMPAT: Fix data that wasn't yet valid
        if item.get("status") == "skip":
            match = known.search(item.get("log") or "")
            if match:
                item["status"] = "failure"
                item["tracker"] = qualify("issues/{0}".format(match.group(1)))

        line = json.dumps(item).encode('utf-8') + b"\n"
        output.write(line)


def commits(branch, pulls, since, verbose=False):
    """Generate (revision, merged, created, url) for the given branch.

    This includes the commits on the branch itself as well as the
    revisions of pull requests targeting the branch in question.

    revision: the SHA of a commit
    merged: True/False/None whether merged or not
    created: the committer date of the commit, or the creation date of
        the pull request
    url: the "url" field of the pull request, or None for a commit on the
        branch itself
    """
    if verbose:
        sys.stderr.write("{0}\n".format(branch))

    # Iterate through commits on the branch
    for commit in task.api.commits(branch, since=since):
        revision = commit["sha"].lower()
        if revision not in SEEDED:
            yield revision, True, commit["commit"]["committer"]["date"], None

    # Iterate through pull requests
    for pull in pulls:
        # seed() notes completed pull requests by the "pull" value of their
        # records, which is the url yielded below. So look for the url, and
        # for the number as before.
        if pull["url"] in SEEDED or pull["number"] in SEEDED:
            continue
        if pull["base"]["ref"] != branch:
            continue
        if verbose:
            sys.stderr.write("pull-{0}\n".format(pull["number"]))
        merged = pulls.merged(pull)

        for revision in revisions(pull):
            yield revision, merged, pull["created_at"], pull["url"]

            # The next revisions for the pull request are not the ones
            # that got merged. Only the first one produced by revisions
            if merged:
                merged = False


def revisions(pull):
    """Generate all the revisions of a pull request, its head first.

    GitHub doesn't help us with the earlier revisions, so we have to use
    silly tricks: they are looked up in the status files of the log sinks
    that the statuses of the head revision point to.

    pull: a pull request object. Nothing is generated if it has no head.
    """
    head = (pull.get("head") or {}).get("sha")
    if not head:
        return

    # First give back the main pull request
    head = head.lower()
    yield head

    # All the revisions we've seen
    seen = {head}

    # Seed the set of sinks. We use these sinks to figure out additional
    # revisions for the pull request. Unfortunately GitHub doesn't help us
    # with a list of revisions that this pull request used to reflect. So
    # we have to look to our sink for that info.
    data = task.api.get("commits/{0}/status?page=1&per_page=100".format(head))
    for status in data.get("statuses", []):
        url = status["target_url"]
        if url:
            SEEDED.add(urllib.parse.urljoin(url, "./"))
            sink = urllib.parse.urljoin(url, "../")
            if sink not in SINKS:
                SINKS[sink] = links(sink)

    # We only care about stuff at the sink where pull-XXXX is in the URL.
    # This is how we figure out whether things are related. The number must
    # not continue with further digits: pull-12 is unrelated to pull-123.
    related = re.compile(r"pull-{0}(?![0-9])".format(pull["number"]))

    # Now ask each sink for its set of urls
    for sink_links in SINKS.values():
        for link in sink_links:
            if not related.search(link):
                continue

            # Already retrieved this one
            if link in SEEDED:
                continue

            # Build a URL for the cockpituous sink /status file and read it
            target = urllib.parse.urljoin(link, "status")
            try:
                data = json.loads(retrieve(target))
            except urllib.error.HTTPError as ex:
                # A missing status file is skipped silently
                if ex.code != 404:
                    raise
            except (ValueError, ConnectionError, urllib.error.URLError) as ex:
                sys.stderr.write("{0}: {1}\n".format(target, ex))
            else:
                # The status file contains a "revision" field which is the git revision
                # of what was tested during that test run. This is what we're after
                revision = data.get("revision") if isinstance(data, dict) else None
                if isinstance(revision, str):
                    revision = revision.lower()
                    if revision not in seen:
                        seen.add(revision)
                        yield revision


def logs(revision):
    """Generate (context, created, url, log) for the statuses of a revision.

    This includes multiple test runs for a given revision, and all the
    various status contexts. Pending statuses, statuses without a target
    URL and already seeded targets are skipped. A log that does not exist
    (HTTP 404) is given back as an empty string; when a log cannot be
    reached at all that is reported on stderr and the status is skipped.
    """
    per_page = 100
    page = 1
    while True:
        data = task.api.get("commits/{0}/status?page={1}&per_page={2}".format(revision, page, per_page))
        statuses = data.get("statuses", [])
        for status in statuses:
            # Make sure to not consider "state": "success" as a success
            # here because individual tests may have failed, or been retried.
            #
            # Always only consider tests individually to have run or failed
            # not entire test suite statuses
            if status["state"] == "pending":
                continue
            target = status.get("target_url")
            if not target:
                continue
            if target.endswith(".html"):
                target = target[:-5]
            if target in SEEDED:
                continue
            log = None
            try:
                log = retrieve(target)
            except urllib.error.HTTPError as ex:
                if ex.code != 404:
                    raise
                log = ""
            except (ConnectionResetError, urllib.error.URLError, socket.gaierror) as ex:
                sys.stderr.write("{0}: {1}\n".format(target, ex))
            if log is not None:
                yield (status["context"], status["created_at"], target, log)

        # Only a full page can be followed by another one. The page number
        # has to advance, or the first page would be fetched over and over.
        if len(statuses) != per_page:
            break
        page += 1


def tap(content):
    """Generate (status, name, body, tracker) for each TAP test in content.

    content: the text of a log containing Test Anything Protocol output

    status: possible values "success", "failure", "skip"
    name: the name of the test
    body: full log of the test
    tracker: url tracking the failure, or None
    """
    known = re.compile(r"# SKIP Known issue #([0-9]+)", re.IGNORECASE)
    name = status = tracker = None
    prefix = None
    body = []
    blocks = False
    for line in content.split('\n'):
        # The test intro, everything before here is fluff
        if not prefix and line.startswith("1.."):
            prefix = line
            body = []
            name = status = tracker = None

        # A TAP test status line
        elif line.startswith(("ok ", "not ok ")):
            body.append(line)
            # Parse out the status
            if line.startswith("not ok "):
                status = "failure"
                line = line[7:]
            else:
                line = line[3:]
                match = known.search(line)
                if match:
                    # A test skipped due to a known issue did fail, and
                    # the issue number tells where that is tracked
                    status = "failure"
                    tracker = qualify("issues/{0}".format(match.group(1)))
                elif "# SKIP" in line.upper():
                    status = "skip"
                else:
                    status = "success"
            # Parse out the name. There may be nothing after the number.
            while line and (line[0].isspace() or line[0].isdigit()):
                line = line[1:]
            (name, delim, directive) = line.partition("#")
            (name, delim, directive) = name.partition("duration")
            name = name.strip()
            # Old Cockpit tests had strange blocks
            if not blocks:
                yield (status, name, "\n".join(body), tracker)
                status = name = tracker = None
                body = []
        else:
            # Old Cockpit tests didn't bound their stuff properly
            if line.startswith("# --------------------"):
                blocks = True
                if status:
                    yield (status, name, "\n".join(body), tracker)
                name = status = tracker = None
                body = []
            body.append(line)

    # With the old blocks a test is only given back at the next separator,
    # so the last test is still outstanding when the content ends
    if status:
        yield (status, name, "\n".join(body), tracker)


def qualify(path):
    """Qualify a path into a GitHub API URL for the repository."""
    qualified = task.api.qualify(path)
    return "https://api.github.com" + qualified


# When run as a script, hand run() to task.main() as the function to call
if __name__ == '__main__':
    task.main(function=run, title="Pull out test data for pull requests", verbose=True)