Source code for github2pandas.utility

import os
from pathlib import Path
import numpy
import pandas as pd
import github
import pickle
from human_id import generate_id
import json
import uuid

class Utility():
    """
    Class which contains methods for multiple modules.

    Attributes
    ----------
    USERS : str
        Pandas table file for user data.
    REPO : str
        Json file for general repository information.

    Methods
    -------
    check_for_updates(new_list, old_df)
        Check if id and updated_at are in the old_df.
    check_for_updates_paginated(new_paginated_list, old_df)
        Check if id and updated_at are in the old_df.
    save_list_to_pandas_table(dir, file, data_list)
        Save a data list to a pandas table.
    get_repo_informations(data_root_dir)
        Get the repository data (owner and name).
    get_repos(token, data_root_dir, whitelist_patterns=None, blacklist_patterns=None)
        Get multiple repositories by pattern and token.
    get_repo(repo_owner, repo_name, token, data_root_dir)
        Get a repository by owner, name and token.
    apply_datetime_format(pd_table, source_column, destination_column=None)
        Provide an equal date format for all timestamps.
    get_users(data_root_dir)
        Get the generated users pandas table.
    get_users_ids(data_root_dir)
        Get the generated users as dict with github ids as keys and anonym uuids as values.
    extract_assignees(github_assignees, users_ids, data_root_dir)
        Get all assignees as one string.
    extract_labels(github_labels)
        Get all labels as one string.
    extract_user_data(user, users_ids, data_root_dir, node_id_to_anonym_uuid=False)
        Extracting general user data.
    extract_author_data_from_commit(repo, sha, users_ids, data_root_dir)
        Extracting general author data from a commit.
    extract_committer_data_from_commit(repo, sha, users_ids, data_root_dir)
        Extracting general committer data from a commit.
    extract_reaction_data(reaction, parent_id, parent_name, users_ids, data_root_dir)
        Extracting general reaction data.
    extract_event_data(event, parent_id, parent_name, users_ids, data_root_dir)
        Extracting general event data from an issue or pull request.
    extract_comment_data(comment, parent_id, parent_name, users_ids, data_root_dir)
        Extracting general comment data from a pull request or issue.
    define_unknown_user(unknown_user_name, uuid, data_root_dir, new_user=False)
        Defines an unknown user. Adds the unknown user as alias or creates a new user.

    """
    USERS = "Users.p"
    REPO = "Repo.json"

    @staticmethod
    def check_for_updates(new_list, old_df):
        """
        check_for_updates(new_list, old_df)

        Check if id and updated_at are in the old_df.

        Parameters
        ----------
        new_list : list
            New list of objects exposing ``id`` and ``updated_at``.
        old_df : DataFrame
            Old DataFrame with the columns ``id`` and ``updated_at``.

        Returns
        -------
        bool
            True if the data needs to be updated. False if the list is up to date.

        """
        if old_df.empty:
            return len(new_list) > 0
        # Compare against the number of rows. Counting the non-null values of
        # the first column would report a change whenever that column has NaN.
        if len(new_list) != len(old_df):
            return True
        for new_class in new_list:
            known = old_df.loc[
                (old_df.id == new_class.id)
                & (old_df.updated_at == new_class.updated_at)
            ]
            if known.empty:
                return True
        return False

    @staticmethod
    def check_for_updates_paginated(new_paginated_list, old_df):
        """
        check_for_updates_paginated(new_paginated_list, old_df)

        Check if id and updated_at are in the old_df.

        Parameters
        ----------
        new_paginated_list : PaginatedList
            New paginated list of objects exposing ``id`` and ``updated_at``.
        old_df : DataFrame
            Old DataFrame with the columns ``id`` and ``updated_at``.

        Returns
        -------
        bool
            True if it needs to be updated. False if the list is up to date.

        """
        if old_df.empty:
            # .totalCount crashes in case of a totally empty repository
            try:
                count = new_paginated_list.totalCount
            except Exception:
                return False
            return count != 0
        # Row count, not the non-null count of the first column.
        if new_paginated_list.totalCount != len(old_df):
            return True
        for new_class in new_paginated_list:
            try:
                known = old_df.loc[
                    (old_df.id == new_class.id)
                    & (old_df.updated_at == new_class.updated_at)
                ]
            except Exception:
                # Best effort: a failing comparison is reported as
                # "no update needed" instead of aborting the caller.
                return False
            if known.empty:
                return True
        return False

    @staticmethod
    def save_list_to_pandas_table(dir, file, data_list):
        """
        save_list_to_pandas_table(dir, file, data_list)

        Save a data list to a pandas table.

        Parameters
        ----------
        dir : str
            Path to the desired save dir. It is created if it does not exist.
        file : str
            Name of the file.
        data_list : list
            List of data dictionaries.

        """
        target_dir = Path(dir)
        target_dir.mkdir(parents=True, exist_ok=True)
        data_frame = pd.DataFrame(data_list)
        with open(Path(target_dir, file), "wb") as f:
            pickle.dump(data_frame, f)

    @staticmethod
    def get_repo_informations(data_root_dir):
        """
        get_repo_informations(data_root_dir)

        Get the repository data (owner and name).

        Parameters
        ----------
        data_root_dir : str
            Data root directory for the repository.

        Returns
        -------
        tuple
            Repository owner and name, or (None, None) if no repository file exists.

        """
        repo_file = Path(data_root_dir, Utility.REPO)
        if not repo_file.is_file():
            return None, None
        with open(repo_file, 'r') as json_file:
            repo_data = json.load(json_file)
        return (repo_data["repo_owner"], repo_data["repo_name"])

    @staticmethod
    def get_repos(token, data_root_dir, whitelist_patterns=None, blacklist_patterns=None):
        """
        get_repos(token, data_root_dir, whitelist_patterns=None, blacklist_patterns=None)

        Get multiple repositories by multiple patterns and token.

        Parameters
        ----------
        token : str
            A valid Github Token.
        data_root_dir : str
            Data root directory for the repositories.
        whitelist_patterns : list, default=None
            Substrings of which at least one has to be part of the repository
            name. None or an empty list accepts every repository.
        blacklist_patterns : list, default=None
            Substrings which must not be part of the repository name.
            None or an empty list rejects no repository.

        Returns
        -------
        List
            List of Repository objects from pygithub.

        Notes
        -----
            PyGithub Repository object structure: https://pygithub.readthedocs.io/en/latest/github_objects/Repository.html

        """
        g = github.Github(token)
        relevant_repos = []
        for repo in g.get_user().get_repos():
            if whitelist_patterns and not any(
                    pattern in repo.name for pattern in whitelist_patterns):
                continue
            # A missing blacklist (None) must not be iterated.
            if blacklist_patterns and any(
                    pattern in repo.name for pattern in blacklist_patterns):
                continue
            repo_dir = Path(data_root_dir, repo.owner.login, repo.name)
            repo_dir.mkdir(parents=True, exist_ok=True)
            with open(Path(repo_dir, Utility.REPO), 'w') as json_file:
                json.dump({"repo_owner": repo.owner.login, "repo_name": repo.name}, json_file)
            relevant_repos.append(repo)
        return relevant_repos

    @staticmethod
    def get_repo(repo_owner, repo_name, token, data_root_dir):
        """
        get_repo(repo_owner, repo_name, token, data_root_dir)

        Get a repository by owner, name and token.

        Parameters
        ----------
        repo_owner : str
            The owner of the desired repository.
        repo_name : str
            The name of the desired repository.
        token : str
            A valid Github Token.
        data_root_dir : str
            Data root directory for the repository. A Path object is accepted as well.

        Returns
        -------
        repo
            Repository object from pygithub.

        Notes
        -----
            PyGithub Repository object structure: https://pygithub.readthedocs.io/en/latest/github_objects/Repository.html

        """
        g = github.Github(token)
        # Wrap in Path so that plain strings work as documented.
        root_dir = Path(data_root_dir)
        root_dir.mkdir(parents=True, exist_ok=True)
        with open(Path(root_dir, Utility.REPO), 'w') as json_file:
            json.dump({"repo_owner": repo_owner, "repo_name": repo_name}, json_file)
        return g.get_repo(repo_owner + "/" + repo_name)

    @staticmethod
    def apply_datetime_format(pd_table, source_column, destination_column=None):
        """
        apply_datetime_format(pd_table, source_column, destination_column=None)

        Provide an equal date format for all timestamps.

        Parameters
        ----------
        pd_table : pandas DataFrame
            Table which contains the source column. It is modified in place.
        source_column : str
            Source column name.
        destination_column : str, default=None
            Destination column name. Saves to source column if None.

        Returns
        -------
        DataFrame
            The passed table with the converted column.

        """
        if not destination_column:
            destination_column = source_column
        pd_table[destination_column] = pd.to_datetime(
            pd_table[source_column], format="%Y-%m-%d %H:%M:%S")
        return pd_table

    @staticmethod
    def get_users(data_root_dir):
        """
        get_users(data_root_dir)

        Get the generated users pandas table.

        Parameters
        ----------
        data_root_dir : str
            Data root directory for the repository.

        Returns
        -------
        DataFrame
            Pandas DataFrame which includes the users data.
            Empty if no users file exists.

        """
        users_file = Path(data_root_dir, Utility.USERS)
        if not users_file.is_file():
            return pd.DataFrame()
        # NOTE: unpickling can execute arbitrary code. Only read tables
        # which were written by this tool.
        return pd.read_pickle(users_file)

    @staticmethod
    def get_users_ids(data_root_dir):
        """
        get_users_ids(data_root_dir)

        Get the generated users as dict with github ids as keys and anonym uuids as values.

        Parameters
        ----------
        data_root_dir : str
            Data root directory for the repository.

        Returns
        -------
        dict
            Dict with github ids as keys and anonym uuids as values.

        """
        df_users = Utility.get_users(data_root_dir)
        if df_users.empty:
            return {}
        return dict(zip(df_users["id"], df_users["anonym_uuid"]))

    @staticmethod
    def extract_assignees(github_assignees, users_ids, data_root_dir):
        """
        extract_assignees(github_assignees, users_ids, data_root_dir)

        Get all assignees as one string.

        Parameters
        ----------
        github_assignees : list
            List of NamedUser.
        users_ids : dict
            Dict of User Ids as Keys and anonym Ids as Value.
        data_root_dir : str
            Data root directory for the repository.

        Returns
        -------
        str
            String which contains all assignees, connected with the char &.

        Notes
        -----
            PyGithub NamedUser object structure: https://pygithub.readthedocs.io/en/latest/github_objects/NamedUser.html

        """
        anonym_uuids = (
            Utility.extract_user_data(assignee, users_ids, data_root_dir)
            for assignee in github_assignees
        )
        # extract_user_data returns None for users it can not resolve.
        # Those are skipped instead of breaking the concatenation.
        return "&".join(
            anonym_uuid for anonym_uuid in anonym_uuids if anonym_uuid is not None)

    @staticmethod
    def extract_labels(github_labels):
        """
        extract_labels(github_labels)

        Get all labels as one string.

        Parameters
        ----------
        github_labels : list
            List of Label.

        Returns
        -------
        str
            String which contains all labels, connected with the char &.

        Notes
        -----
            PyGithub Label object structure: https://pygithub.readthedocs.io/en/latest/github_objects/Label.html

        """
        return "&".join(label.name for label in github_labels)

    @staticmethod
    def extract_user_data(user, users_ids, data_root_dir, node_id_to_anonym_uuid=False):
        """
        extract_user_data(user, users_ids, data_root_dir, node_id_to_anonym_uuid=False)

        Extracting general user data.

        Parameters
        ----------
        user : NamedUser
            NamedUser object from pygithub.
        users_ids : dict
            Dict of User Ids as Keys and anonym Ids as Value.
            A newly stored user is added to this dict.
        data_root_dir : str
            Repo dir of the project.
        node_id_to_anonym_uuid : bool, default=False
            Node_id will be the anonym_uuid.

        Returns
        -------
        str
            Anonym uuid of user. None if there is no user or the user is
            an "invalid-email-address" login without a name.

        Notes
        -----
            PyGithub NamedUser object structure: https://pygithub.readthedocs.io/en/latest/github_objects/NamedUser.html

        """
        if not user:
            return None
        if user.node_id in users_ids:
            return users_ids[user.node_id]

        users_file = Path(data_root_dir, Utility.USERS)
        if users_file.is_file():
            users_df = pd.read_pickle(users_file)
        else:
            users_df = pd.DataFrame()

        user_data = {}
        if node_id_to_anonym_uuid:
            user_data["anonym_uuid"] = user.node_id
        else:
            user_data["anonym_uuid"] = generate_id(seed=user.node_id)
        user_data["id"] = user.node_id
        for field in ("name", "email", "login"):
            # Best effort: a field which can not be read is left out.
            try:
                user_data[field] = getattr(user, field)
            except Exception:
                pass

        if user_data.get("login") == "invalid-email-address" and "name" not in user_data:
            return None

        users_ids[user.node_id] = user_data["anonym_uuid"]
        # DataFrame.append was removed in pandas 2.0, concat replaces it.
        new_row = pd.DataFrame([user_data])
        if users_df.empty:
            users_df = new_row
        else:
            users_df = pd.concat([users_df, new_row], ignore_index=True)
        users_file.parent.mkdir(parents=True, exist_ok=True)
        with open(users_file, "wb") as f:
            pickle.dump(users_df, f)
        return user_data["anonym_uuid"]

    @staticmethod
    def extract_author_data_from_commit(repo, sha, users_ids, data_root_dir):
        """
        extract_author_data_from_commit(repo, sha, users_ids, data_root_dir)

        Extracting general author data from a commit.

        Parameters
        ----------
        repo : Repository
            Repository object from pygithub.
        sha : str
            Sha from the commit.
        users_ids : dict
            Dict of User Ids as Keys and anonym Ids as Value.
        data_root_dir : str
            Data root directory for the repository.

        Returns
        -------
        str
            Anonym uuid of user. None if there is no sha, commit or author.

        Notes
        -----
            PyGithub Repository object structure: https://pygithub.readthedocs.io/en/latest/github_objects/Repository.html

        """
        if not sha:
            return None
        commit = repo.get_commit(sha)
        if not commit:
            return None
        if commit._author == github.GithubObject.NotSet:
            return None
        return Utility.extract_user_data(commit.author, users_ids, data_root_dir)

    @staticmethod
    def extract_committer_data_from_commit(repo, sha, users_ids, data_root_dir):
        """
        extract_committer_data_from_commit(repo, sha, users_ids, data_root_dir)

        Extracting general committer data from a commit.

        Parameters
        ----------
        repo : Repository
            Repository object from pygithub.
        sha : str
            Sha from the commit.
        users_ids : dict
            Dict of User Ids as Keys and anonym Ids as Value.
        data_root_dir : str
            Data root directory for the repository.

        Returns
        -------
        str
            Anonym uuid of user. None if there is no sha, commit or committer.

        Notes
        -----
            PyGithub Repository object structure: https://pygithub.readthedocs.io/en/latest/github_objects/Repository.html

        """
        if not sha:
            return None
        commit = repo.get_commit(sha)
        if not commit:
            return None
        if commit._committer == github.GithubObject.NotSet:
            return None
        return Utility.extract_user_data(commit.committer, users_ids, data_root_dir)

    @staticmethod
    def extract_reaction_data(reaction, parent_id, parent_name, users_ids, data_root_dir):
        """
        extract_reaction_data(reaction, parent_id, parent_name, users_ids, data_root_dir)

        Extracting general reaction data.

        Parameters
        ----------
        reaction : Reaction
            Reaction object from pygithub.
        parent_id : int
            Id from parent as foreign key.
        parent_name : str
            Name of the parent.
        users_ids : dict
            Dict of User Ids as Keys and anonym Ids as Value.
        data_root_dir : str
            Repo dir of the project.

        Returns
        -------
        ReactionData
            Dictionary with the extracted data.

        Notes
        -----
            Reaction object structure: https://pygithub.readthedocs.io/en/latest/github_objects/Reaction.html

        """
        reaction_data = {
            parent_name + "_id": parent_id,
            "content": reaction.content,
            "created_at": reaction.created_at,
            "id": reaction.id,
        }
        if not reaction._user == github.GithubObject.NotSet:
            reaction_data["author"] = Utility.extract_user_data(
                reaction.user, users_ids, data_root_dir)
        return reaction_data

    @staticmethod
    def extract_event_data(event, parent_id, parent_name, users_ids, data_root_dir):
        """
        extract_event_data(event, parent_id, parent_name, users_ids, data_root_dir)

        Extracting general event data from an issue or pull request.

        Parameters
        ----------
        event : IssueEvent
            IssueEvent object from pygithub.
        parent_id : int
            Id from parent as foreign key.
        parent_name : str
            Name of the parent.
        users_ids : dict
            Dict of User Ids as Keys and anonym Ids as Value.
        data_root_dir : str
            Repo dir of the project.

        Returns
        -------
        EventData
            Dictionary with the extracted data.

        Notes
        -----
            IssueEvent object structure: https://pygithub.readthedocs.io/en/latest/github_objects/IssueEvent.html

        """
        issue_event_data = {}
        issue_event_data[parent_name + "_id"] = parent_id
        if not event._actor == github.GithubObject.NotSet:
            issue_event_data["author"] = Utility.extract_user_data(
                event.actor, users_ids, data_root_dir)
        issue_event_data["commit_sha"] = event.commit_id
        issue_event_data["created_at"] = event.created_at
        issue_event_data["event"] = event.event
        issue_event_data["id"] = event.id
        if not event._label == github.GithubObject.NotSet:
            issue_event_data["label"] = event.label.name
        if not event._assignee == github.GithubObject.NotSet:
            issue_event_data["assignee"] = Utility.extract_user_data(
                event.assignee, users_ids, data_root_dir)
        if not event._assigner == github.GithubObject.NotSet:
            issue_event_data["assigner"] = Utility.extract_user_data(
                event.assigner, users_ids, data_root_dir)
        return issue_event_data

    @staticmethod
    def extract_comment_data(comment, parent_id, parent_name, users_ids, data_root_dir):
        """
        extract_comment_data(comment, parent_id, parent_name, users_ids, data_root_dir)

        Extracting general comment data from a pull request or issue.

        Parameters
        ----------
        comment : github_object
            PullRequestComment or IssueComment object from pygithub.
        parent_id : int
            Id from parent as foreign key.
        parent_name : str
            Name of the parent.
        users_ids : dict
            Dict of User Ids as Keys and anonym Ids as Value.
        data_root_dir : str
            Repo dir of the project.

        Returns
        -------
        CommentData
            Dictionary with the extracted data.

        Notes
        -----
            PullRequestComment object structure: https://pygithub.readthedocs.io/en/latest/github_objects/PullRequestComment.html
            IssueComment object structure: https://pygithub.readthedocs.io/en/latest/github_objects/IssueComment.html

        """
        comment_data = {
            parent_name + "_id": parent_id,
            "body": comment.body,
            "created_at": comment.created_at,
            "id": comment.id,
        }
        if not comment._user == github.GithubObject.NotSet:
            comment_data["author"] = Utility.extract_user_data(
                comment.user, users_ids, data_root_dir)
        return comment_data

    @staticmethod
    def define_unknown_user(unknown_user_name, uuid, data_root_dir, new_user=False):
        """
        define_unknown_user(unknown_user_name, uuid, data_root_dir, new_user=False)

        Defines an unknown user. Adds the unknown user as alias or creates a new user.

        Parameters
        ----------
        unknown_user_name : str
            Name of unknown user.
        uuid : str
            Uuid can be the anonym uuid of another user or random uuid for a new user.
        data_root_dir : str
            Data root directory for the repository.
        new_user : bool, default=False
            A complete new user with anonym_uuid will be generated.

        Returns
        -------
        str
            Uuid of the user.

        """
        users = Utility.get_users(data_root_dir)
        # Without a stored users table there is no anonym_uuid column to query.
        if "anonym_uuid" in users.columns:
            is_known_user = users.anonym_uuid == uuid
            p_user = users.loc[is_known_user]
            if not p_user.empty:
                user = p_user.iloc[0]
                if "alias" not in user or pd.isnull(user["alias"]):
                    alias = unknown_user_name
                elif unknown_user_name in user["alias"].split(';'):
                    alias = user["alias"]
                else:
                    alias = user["alias"] + ";" + unknown_user_name
                users.loc[is_known_user, 'alias'] = alias
                pd_file = Path(data_root_dir, Utility.USERS)
                with open(pd_file, "wb") as f:
                    pickle.dump(users, f)
                return user["anonym_uuid"]

        class UserData:
            # Minimal stand-in offering the attributes extract_user_data reads.
            node_id = uuid
            name = unknown_user_name
            # numpy.NaN was removed in NumPy 2.0, numpy.nan works everywhere.
            email = numpy.nan
            login = numpy.nan

        users_ids = Utility.get_users_ids(data_root_dir)
        if new_user:
            return Utility.extract_user_data(UserData(), users_ids, data_root_dir)
        return Utility.extract_user_data(
            UserData(), users_ids, data_root_dir, node_id_to_anonym_uuid=True)