import pandas as pd
from pathlib import Path
import os
import github
from .utility import Utility
[docs]class PullRequests():
"""
Class to aggregate Pull Requests
Attributes
----------
PULL_REQUESTS_DIR : str
Pull request dir where all files are saved in.
PULL_REQUESTS : str
Pandas table file for pull request data.
PULL_REQUESTS_COMMENTS : str
Pandas table file for comments data in pull requests.
PULL_REQUESTS_REACTIONS : str
Pandas table file for reactions data in pull requests.
PULL_REQUESTS_REVIEWS : str
Pandas table file for reviews data in pull requests.
PULL_REQUESTS_EVENTS : str
Pandas table file for events data in pull requests.
PULL_REQUESTS_COMMITS : str
Pandas table file for commits data in pull requests.
Methods
-------
extract_pull_request_data(pull_request, users_ids, data_root_dir)
Extracting general pull request data.
extract_pull_request_review_data(review, pull_request_id, users_ids, data_root_dir)
Extracting general review data from a pull request.
extract_pull_request_commit_data(review, users_ids, pull_request_id)
Extracting commit data from a pull request.
generate_pull_request_pandas_tables(repo, data_root_dir, reactions=False, check_for_updates=True)
Extracting the complete pull request data from a repository.
get_pull_requests(data_root_dir, filename=PULL_REQUESTS))
Get a genearted pandas table.
"""
PULL_REQUESTS_DIR = "PullRequests"
PULL_REQUESTS = "pdPullRequests.p"
PULL_REQUESTS_COMMENTS = "pdPullRequestsComments.p"
PULL_REQUESTS_REACTIONS = "pdPullRequestsReactions.p"
PULL_REQUESTS_REVIEWS = "pdPullRequestsReviews.p"
PULL_REQUESTS_EVENTS = "pdPullRequestsEvents.p"
PULL_REQUESTS_COMMITS = "pdPullRequestsCommits.p"
[docs] @staticmethod
def generate_pull_request_pandas_tables(repo, data_root_dir, reactions=False, check_for_updates=True):
"""
generate_pull_request_pandas_tables(repo, data_root_dir, reactions=False, check_for_updates=True)
Extracting the complete pull request data from a repository.
Parameters
----------
repo : Repository
Repository object from pygithub.
data_root_dir : str
Data root directory for the repository.
reactions : bool, default=False
If reactions should also be exracted. The extraction of all reactions increases significantly the aggregation speed.
check_for_updates : bool, default=True
Check first if there are any new pull requests information.
Notes
-----
PyGithub Repository object structure: https://pygithub.readthedocs.io/en/latest/github_objects/Repository.html
"""
if check_for_updates:
pull_requests = repo.get_pulls(state='all')
old_pull_requests = PullRequests.get_pull_requests(data_root_dir)
if not Utility.check_for_updates_paginated(pull_requests, old_pull_requests):
return
pull_request_dir = Path(data_root_dir, PullRequests.PULL_REQUESTS_DIR)
pull_requests = repo.get_pulls(state='all')
users_ids = Utility.get_users_ids(data_root_dir)
pull_request_list = []
pull_request_comment_list = []
pull_request_reaction_list = []
pull_request_review_list = []
pull_request_event_list = []
pull_request_commit_list = []
# pull request data
for pull_request in pull_requests:
pull_request_data = PullRequests.extract_pull_request_data(pull_request, users_ids, data_root_dir)
pull_request_list.append(pull_request_data)
# pull request comment data
for comment in pull_request.get_comments():
pull_request_comment_data = Utility.extract_comment_data(comment, pull_request.id, "pull_request", users_ids, data_root_dir)
pull_request_comment_list.append(pull_request_comment_data)
# pull request reaction data
if reactions:
for reaction in comment.get_reactions():
reaction_data = Utility.extract_reaction_data(reaction,comment.id, "comment", users_ids, data_root_dir)
pull_request_reaction_list.append(reaction_data)
# pull request review data
for review in pull_request.get_reviews():
pull_request_review_data = PullRequests.extract_pull_request_review_data(review, pull_request.id, users_ids, data_root_dir)
pull_request_review_list.append(pull_request_review_data)
# pull request issue comments data
for comment in pull_request.get_issue_comments():
pull_request_comment_data = Utility.extract_comment_data(comment, pull_request.id, "pull_request", users_ids, data_root_dir)
pull_request_comment_list.append(pull_request_comment_data)
# pull request reaction data
if reactions:
for reaction in comment.get_reactions():
reaction_data = Utility.extract_reaction_data(reaction,comment.id, "comment", users_ids, data_root_dir)
pull_request_reaction_list.append(reaction_data)
# pull request issue events
for event in pull_request.get_issue_events():
pull_request_event_data = Utility.extract_event_data(event, pull_request.id, "pull_request", users_ids, data_root_dir)
pull_request_event_list.append(pull_request_event_data)
# pull request commits
for commit in pull_request.get_commits():
pull_request_commit_data = PullRequests.extract_pull_request_commit_data(commit, pull_request.id)
pull_request_commit_list.append(pull_request_commit_data)
# Save lists
Utility.save_list_to_pandas_table(pull_request_dir, PullRequests.PULL_REQUESTS, pull_request_list)
Utility.save_list_to_pandas_table(pull_request_dir, PullRequests.PULL_REQUESTS_COMMENTS, pull_request_comment_list)
if reactions:
Utility.save_list_to_pandas_table(pull_request_dir, PullRequests.PULL_REQUESTS_REACTIONS, pull_request_reaction_list)
Utility.save_list_to_pandas_table(pull_request_dir, PullRequests.PULL_REQUESTS_REVIEWS, pull_request_review_list)
Utility.save_list_to_pandas_table(pull_request_dir, PullRequests.PULL_REQUESTS_EVENTS, pull_request_event_list)
Utility.save_list_to_pandas_table(pull_request_dir, PullRequests.PULL_REQUESTS_COMMITS, pull_request_commit_list)
[docs] @staticmethod
def get_pull_requests(data_root_dir, filename=PULL_REQUESTS):
"""
get_pull_requests(data_root_dir, filename=PULL_REQUESTS))
Get a genearted pandas table.
Parameters
----------
data_root_dir : str
Data root directory for the repository.
filename : str, default=PULL_REQUESTS
Pandas table file for pull requests or comments or reactions or reviews or events data.
Returns
-------
DataFrame
Pandas DataFrame which can includes the desired data
"""
pull_request_dir = Path(data_root_dir, PullRequests.PULL_REQUESTS_DIR)
pd_pull_requests_file = Path(pull_request_dir, filename)
if pd_pull_requests_file.is_file():
return pd.read_pickle(pd_pull_requests_file)
else:
return pd.DataFrame()