import pandas as pd
from pathlib import Path
import os
import github
from .utility import Utility
class Issues():
    """
    Class to aggregate Issues

    Attributes
    ----------
    ISSUES_DIR : str
        Issues dir where all files are saved in.
    ISSUES : str
        Pandas table file for issues data.
    ISSUES_COMMENTS : str
        Pandas table file for comments data in issues.
    ISSUES_REACTIONS : str
        Pandas table file for reactions data in issues.
    ISSUES_EVENTS : str
        Pandas table file for events data in issues.

    Methods
    -------
    extract_issue_data(issue, users_ids, data_root_dir)
        Extracting general issue data.
    generate_issue_pandas_tables(repo, data_root_dir, reactions=False, check_for_updates=True)
        Extracting the complete issue data from a repository.
    get_issues(data_root_dir, filename=ISSUES)
        Get a generated pandas table.

    """

    ISSUES_DIR = "Issues"
    ISSUES = "pdIssues.p"
    ISSUES_COMMENTS = "pdIssuesComments.p"
    ISSUES_REACTIONS = "pdIssuesReactions.p"
    ISSUES_EVENTS = "pdIssuesEvents.p"

    @staticmethod
    def generate_issue_pandas_tables(repo, data_root_dir, reactions=False, check_for_updates=True):
        """
        generate_issue_pandas_tables(repo, data_root_dir, reactions=False, check_for_updates=True)

        Extracting the complete issue data from a repository.

        Parameters
        ----------
        repo : Repository
            Repository object from pygithub.
        data_root_dir : str
            Data root directory for the repository.
        reactions : bool, default=False
            If reactions should also be extracted. Reactions are requested
            separately for every issue and every comment, which significantly
            increases the aggregation time.
        check_for_updates : bool, default=True
            Check first if there are any new issues information. If nothing
            changed, no table is written.

        Returns
        -------
        None
            The tables for issues, comments and events (and reactions, if
            requested) are written below ``data_root_dir/ISSUES_DIR``.

        Notes
        -----
        PyGithub Repository object structure: https://pygithub.readthedocs.io/en/latest/github_objects/Repository.html

        """
        # The issue listing also contains pull requests. Walk the listing once,
        # keep only real issues and reuse that list for both the update check
        # and the extraction, so the paginated listing is not requested twice.
        real_issues = [
            issue for issue in repo.get_issues(state='all')
            if issue._pull_request == github.GithubObject.NotSet
        ]

        if check_for_updates:
            old_issues = Issues.get_issues(data_root_dir)
            if not Utility.check_for_updates(real_issues, old_issues):
                return

        issues_dir = Path(data_root_dir, Issues.ISSUES_DIR)
        users_ids = Utility.get_users_ids(data_root_dir)

        issue_list = []
        issue_comment_list = []
        issue_event_list = []
        issue_reaction_list = []

        for issue in real_issues:
            # issue data
            issue_list.append(Issues.extract_issue_data(issue, users_ids, data_root_dir))

            # issue comment data
            for comment in issue.get_comments():
                issue_comment_list.append(
                    Utility.extract_comment_data(comment, issue.id, "issue", users_ids, data_root_dir)
                )
                # issue comment reaction data
                if reactions:
                    issue_reaction_list.extend(
                        Utility.extract_reaction_data(reaction, comment.id, "comment", users_ids, data_root_dir)
                        for reaction in comment.get_reactions()
                    )

            # issue event data
            issue_event_list.extend(
                Utility.extract_event_data(event, issue.id, "issue", users_ids, data_root_dir)
                for event in issue.get_events()
            )

            # issue reaction data
            if reactions:
                issue_reaction_list.extend(
                    Utility.extract_reaction_data(reaction, issue.id, "issue", users_ids, data_root_dir)
                    for reaction in issue.get_reactions()
                )

        # Save lists
        Utility.save_list_to_pandas_table(issues_dir, Issues.ISSUES, issue_list)
        Utility.save_list_to_pandas_table(issues_dir, Issues.ISSUES_COMMENTS, issue_comment_list)
        Utility.save_list_to_pandas_table(issues_dir, Issues.ISSUES_EVENTS, issue_event_list)
        # The reactions table is only written when reactions were requested,
        # so an existing reactions file is left untouched otherwise.
        if reactions:
            Utility.save_list_to_pandas_table(issues_dir, Issues.ISSUES_REACTIONS, issue_reaction_list)

    @staticmethod
    def get_issues(data_root_dir, filename=ISSUES):
        """
        get_issues(data_root_dir, filename=ISSUES)

        Get a generated pandas table.

        Parameters
        ----------
        data_root_dir : str
            Data root directory for the repository.
        filename : str, default=ISSUES
            Pandas table file for issues or comments or reactions or events data.

        Returns
        -------
        DataFrame
            Pandas DataFrame with the stored data, or an empty DataFrame if
            the requested file does not exist.

        """
        pd_issues_file = Path(data_root_dir, Issues.ISSUES_DIR, filename)
        if not pd_issues_file.is_file():
            return pd.DataFrame()
        # NOTE: unpickling executes code embedded in the file; only point
        # data_root_dir at directories whose contents are trusted.
        return pd.read_pickle(pd_issues_file)