Source code for github2pandas.workflows

import requests
import zipfile
import io
import pandas as pd
from pathlib import Path
from .utility import Utility

[docs]class Workflows(object):
    """
    Class to aggregate Workflows

    Attributes
    ----------
    WORKFLOWS_DIR : str
        workflow dir where all files are saved in.
    WORKFLOWS : str
        Pandas table file for workflow data.
    WORKFLOWS_RUNS : str
        Pandas table file for run data.

    Methods
    -------
    extract_workflow_data(workflow)
        Extracting general workflow data.
    extract_workflow_run_data(workflow_run)
        Extracting general workflow run data.
    generate_workflow_pandas_tables(repo, data_root_dir, check_for_updates=True)
        Extracting the complete workflow list and run history from a repository.
    download_workflow_log_files(repo, github_token, workflow_run_id, data_root_dir)
        Receive workflow log files from GitHub.
    get_workflows(data_root_dir, filename=WORKFLOWS)
        Get a generated pandas tables.
    
    """

    WORKFLOWS_DIR = "Workflows"
    WORKFLOWS = "pdWorkflows.p"
    WORKFLOWS_RUNS =  "pdWorkflowsRuns.p"

[docs]    @staticmethod
    def extract_workflow_data(workflow):
        """
        extract_workflow_data(workflow)

        Extracting general workflow data.

        Parameters
        ----------
        workflow : Workflow
            Workflow object from pygithub.

        Returns
        -------
        dict
            Dictionary with the extracted data.

        Notes
        -----
           PyGithub Workflow object structure: https://pygithub.readthedocs.io/en/latest/github_objects/Workflow.html

        """
        workflow_data = {}
        workflow_data["id"] = workflow.id
        workflow_data['name'] = workflow.name
        workflow_data['created_at'] = workflow.created_at
        workflow_data['updated_at'] = workflow.updated_at
        workflow_data["state"] = workflow.state
        return workflow_data
    
[docs]    @staticmethod
    def extract_workflow_run_data(workflow_run):
        """
        extract_workflow_run_data(workflow_run)

        Extracting general workflow run data.

        Parameters
        ----------
        workflow_run : WorkflowRun
            WorkflowRun object from pygithub.

        Returns
        -------
        dict
            Dictionary with the extracted data.

        Notes
        -----
            PyGithub WorkflowRun object structure: https://pygithub.readthedocs.io/en/latest/github_objects/WorkflowRun.html

        """
        workflow_run_data = dict()
        workflow_run_data["workflow_id"] = workflow_run.workflow_id
        workflow_run_data['id'] = workflow_run.id
        workflow_run_data['commit_sha'] = workflow_run.head_sha
        workflow_run_data['pull_requests'] = [pr.id for pr in workflow_run.pull_requests]
        workflow_run_data['state'] = workflow_run.status
        workflow_run_data['event'] = workflow_run.event
        workflow_run_data['conclusion'] = workflow_run.conclusion
        workflow_run_data['created_at'] = workflow_run.created_at
        workflow_run_data['updated_at'] = workflow_run.updated_at
        return workflow_run_data

[docs]    @staticmethod
    def generate_workflow_pandas_tables(repo, data_root_dir, check_for_updates=True):
        """
        generate_workflow_pandas_tables(repo, data_root_dir, check_for_updates=True)

        Extracting the complete workflow list and run history from a repository.

        Parameters
        ----------
        repo : Repository
            Repository object from pygithub.
        data_root_dir : str
            Data root directory for the repository.
        check_for_updates : bool, default=True
            Check first if there are any new workflows or workflow_runs information.
            
        Notes
        -----
            PyGithub Repository object structure: https://pygithub.readthedocs.io/en/latest/github_objects/Repository.html
        
        """

        workflow_dir = Path(data_root_dir, Workflows.WORKFLOWS_DIR)
        workflow_dir.mkdir(parents=True, exist_ok=True)
        users_ids = Utility.get_users_ids(data_root_dir)

        workflows = repo.get_workflows()
        workflow_runs = repo.get_workflow_runs()

        if check_for_updates:
            old_workflows = Workflows.get_workflows(data_root_dir)
            check_workflows = Utility.check_for_updates_paginated(workflows, old_workflows)
            old_workflow_runs = Workflows.get_workflows(data_root_dir,Workflows.WORKFLOWS_RUNS)
            check_workflow_runs = Utility.check_for_updates_paginated(workflow_runs, old_workflow_runs)
            if not check_workflows and not check_workflow_runs:
                return
        
        workflow_list = []
        for workflow in workflows:
            workflow_data = Workflows.extract_workflow_data(workflow)
            workflow_list.append(workflow_data)
        Utility.save_list_to_pandas_table(workflow_dir, Workflows.WORKFLOWS, workflow_list)

        workflow_run_list = []
        for workflow_run in workflow_runs:
            workflow_run_data = Workflows.extract_workflow_run_data(workflow_run)
            workflow_run_data['author'] = Utility.extract_committer_data_from_commit(repo, workflow_run_data['commit_sha'], users_ids, data_root_dir)
            workflow_run_list.append(workflow_run_data)
        Utility.save_list_to_pandas_table(workflow_dir, Workflows.WORKFLOWS_RUNS, workflow_run_list)

[docs]    @staticmethod
    def download_workflow_log_files(repo, github_token, workflow_run_id, data_root_dir):
        """
        download_workflow_log_files(repo, github_token, workflow_run_id, data_root_dir)

        Receive workflow log files from GitHub.

        Parameters
        ----------
        repo : Repository
            Repository object from pygithub.       
        github_token : str
            Authentication token for GitHub access.
        workflow_run_id : int
            Workflow Run Id to download one specific workflow run.  
        data_root_dir : str
            Data root directory for the repository.

        Returns
        -------
        int
            Number of downloaded files.

        Notes
        -------
            Download api https://docs.github.com/en/rest/reference/actions#list-jobs-for-a-workflow-run
            Generation of python code based on https://curl.trillworks.com/
            PyGithub Repository object structure: https://pygithub.readthedocs.io/en/latest/github_objects/Repository.html
            PyGithub WorkflowRun object structure: https://pygithub.readthedocs.io/en/latest/github_objects/WorkflowRun.html

        """
        headers = {
            'Accept': 'application/vnd.github.v3+json',
        }
        query_url = f"https://api.github.com/repos/{repo.owner.login}/{repo.name}/actions/runs/{workflow_run_id}/logs"
        response = requests.get(query_url, headers=headers,
                                auth=('username', github_token))
        if 'zip' in response.headers['Content-Type']:
            zip_obj = zipfile.ZipFile(io.BytesIO(response.content))
            data_dir = Path(data_root_dir, Workflows.WORKFLOWS_DIR, str(workflow_run_id))
            zip_obj.extractall(data_dir)
            return len(zip_obj.namelist())
        else:
            return None

[docs]    @staticmethod
    def get_workflows(data_root_dir, filename=WORKFLOWS):
        """
        get_workflows(data_root_dir, filename=WORKFLOWS)

        Get a generated pandas tables.

        Parameters
        ----------
        data_root_dir : str
            Data root directory for the repository.
        filename : str, default=WORKFLOWS
            Pandas table file for workflows or workflows runs data.

        Returns
        -------
        DataFrame
            Pandas DataFrame which can include the desired data.

        """
        workflow_dir = Path(data_root_dir, Workflows.WORKFLOWS_DIR)
        pd_workflows_file = Path(workflow_dir, filename)
        if pd_workflows_file.is_file():
            return pd.read_pickle(pd_workflows_file)
        else:
            return pd.DataFrame()