nteract#
Activity from ‘2022-12-06’ to ‘2023-03-06’
Load data#
Load and clean up the data
from pathlib import Path
path_data = Path("../data")
comments = pd.read_csv(path_data.joinpath('comments.csv'), index_col=None).drop_duplicates()
issues = pd.read_csv(path_data.joinpath('issues.csv'), index_col=None).drop_duplicates()
prs = pd.read_csv(path_data.joinpath('prs.csv'), index_col=None).drop_duplicates()
for idata in [comments, issues, prs]:
idata.query("org == @github_org", inplace=True)
---------------------------------------------------------------------------
FileNotFoundError Traceback (most recent call last)
Cell In[6], line 3
1 from pathlib import Path
2 path_data = Path("../data")
----> 3 comments = pd.read_csv(path_data.joinpath('comments.csv'), index_col=None).drop_duplicates()
4 issues = pd.read_csv(path_data.joinpath('issues.csv'), index_col=None).drop_duplicates()
5 prs = pd.read_csv(path_data.joinpath('prs.csv'), index_col=None).drop_duplicates()
File /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages/pandas/util/_decorators.py:211, in deprecate_kwarg.<locals>._deprecate_kwarg.<locals>.wrapper(*args, **kwargs)
209 else:
210 kwargs[new_arg_name] = new_arg_value
--> 211 return func(*args, **kwargs)
File /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages/pandas/util/_decorators.py:331, in deprecate_nonkeyword_arguments.<locals>.decorate.<locals>.wrapper(*args, **kwargs)
325 if len(args) > num_allow_args:
326 warnings.warn(
327 msg.format(arguments=_format_argument_list(allow_args)),
328 FutureWarning,
329 stacklevel=find_stack_level(),
330 )
--> 331 return func(*args, **kwargs)
File /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages/pandas/io/parsers/readers.py:950, in read_csv(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, error_bad_lines, warn_bad_lines, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options)
935 kwds_defaults = _refine_defaults_read(
936 dialect,
937 delimiter,
(...)
946 defaults={"delimiter": ","},
947 )
948 kwds.update(kwds_defaults)
--> 950 return _read(filepath_or_buffer, kwds)
File /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages/pandas/io/parsers/readers.py:605, in _read(filepath_or_buffer, kwds)
602 _validate_names(kwds.get("names", None))
604 # Create the parser.
--> 605 parser = TextFileReader(filepath_or_buffer, **kwds)
607 if chunksize or iterator:
608 return parser
File /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages/pandas/io/parsers/readers.py:1442, in TextFileReader.__init__(self, f, engine, **kwds)
1439 self.options["has_index_names"] = kwds["has_index_names"]
1441 self.handles: IOHandles | None = None
-> 1442 self._engine = self._make_engine(f, self.engine)
File /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages/pandas/io/parsers/readers.py:1735, in TextFileReader._make_engine(self, f, engine)
1733 if "b" not in mode:
1734 mode += "b"
-> 1735 self.handles = get_handle(
1736 f,
1737 mode,
1738 encoding=self.options.get("encoding", None),
1739 compression=self.options.get("compression", None),
1740 memory_map=self.options.get("memory_map", False),
1741 is_text=is_text,
1742 errors=self.options.get("encoding_errors", "strict"),
1743 storage_options=self.options.get("storage_options", None),
1744 )
1745 assert self.handles is not None
1746 f = self.handles.handle
File /opt/hostedtoolcache/Python/3.9.16/x64/lib/python3.9/site-packages/pandas/io/common.py:856, in get_handle(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)
851 elif isinstance(handle, str):
852 # Check whether the filename is to be opened in binary mode.
853 # Binary mode does not support 'encoding' and 'newline'.
854 if ioargs.encoding and "b" not in ioargs.mode:
855 # Encoding
--> 856 handle = open(
857 handle,
858 ioargs.mode,
859 encoding=ioargs.encoding,
860 errors=errors,
861 newline="",
862 )
863 else:
864 # Binary mode
865 handle = open(handle, ioargs.mode)
FileNotFoundError: [Errno 2] No such file or directory: '../data/comments.csv'
# What are the top N repos, we will only plot these in the full data plots
top_commented_repos = comments.groupby("repo").count().sort_values("createdAt", ascending=False)['createdAt']
use_repos = top_commented_repos.head(top_n_repos).index.tolist()
Merged Pull requests#
Here’s an analysis of merged pull requests across each of the repositories in the Jupyter ecosystem.
prs_by_repo = merged.groupby(['org', 'repo']).count()['author'].reset_index().sort_values(['org', 'author'], ascending=False)
alt.Chart(data=prs_by_repo, title=f"Merged PRs in the last {n_days} days").mark_bar().encode(
x=alt.X('repo', sort=prs_by_repo['repo'].values.tolist()),
y='author',
color='org'
)
Issues#
Issues are conversations that happen on our GitHub repositories. Here’s an analysis of issues across the Jupyter organizations.
created_counts = created.groupby(['org', 'repo']).count()['number'].reset_index()
created_counts['org/repo'] = created_counts.apply(lambda a: a['org'] + '/' + a['repo'], axis=1)
sorted_vals = created_counts.sort_values(['org', 'number'], ascending=False)['repo'].values
alt.Chart(data=created_counts, title=f"Issues created in the last {n_days} days").mark_bar().encode(
x=alt.X('repo', sort=alt.Sort(sorted_vals.tolist())),
y='number',
)
closed_counts = closed.groupby(['org', 'repo']).count()['number'].reset_index()
closed_counts['org/repo'] = closed_counts.apply(lambda a: a['org'] + '/' + a['repo'], axis=1)
sorted_vals = closed_counts.sort_values(['number'], ascending=False)['repo'].values
alt.Chart(data=closed_counts, title=f"Issues closed in the last {n_days} days").mark_bar().encode(
x=alt.X('repo', sort=alt.Sort(sorted_vals.tolist())),
y='number',
)
charts = []
# Pick the top 10 repositories
top_repos = created_closed.groupby(['repo']).sum().sort_values(by='count', ascending=False).head(10).index
ch = alt.Chart(created_closed.query('repo in @top_repos'), width=120).mark_bar().encode(
x=alt.X("kind", axis=alt.Axis(labelFontSize=15, title="")),
y=alt.Y('count', axis=alt.Axis(titleFontSize=15, labelFontSize=12)),
color='kind',
column=alt.Column("repo", header=alt.Header(title=f"Issue activity, last {n_days} days for {github_org}", titleFontSize=15, labelFontSize=12))
)
ch
time_open = closed.groupby(['org', 'repo']).agg({'time_open': 'median'}).reset_index()
time_open['time_open'] = time_open['time_open'] / (60 * 60 * 24)
time_open['org/repo'] = time_open.apply(lambda a: a['org'] + '/' + a['repo'], axis=1)
sorted_vals = time_open.sort_values(['org', 'time_open'], ascending=False)['repo'].values
alt.Chart(data=time_open, title=f"Time to close for issues closed in the last {n_days} days").mark_bar().encode(
x=alt.X('repo', sort=alt.Sort(sorted_vals.tolist())),
y=alt.Y('time_open', title="Median Days Open"),
)
Most-upvoted issues#
thumbsup = issues.sort_values("thumbsup", ascending=False).head(25)
thumbsup = thumbsup[["title", "url", "number", "thumbsup", "repo"]]
text = []
for ii, irow in thumbsup.iterrows():
itext = f"- ({irow['thumbsup']}) {irow['title']} - {irow['repo']} - [#{irow['number']}]({irow['url']})"
text.append(itext)
text = '\n'.join(text)
HTML(markdown(text))
Commenters across repositories#
These are commenters across all issues and pull requests in the last several days. These are colored by the commenter’s association with the organization. For information about what these associations mean, see this StackOverflow post.
n_plot = 50
charts = []
for ii, (iorg, idata) in enumerate(commentors.groupby(['org'])):
title = f"Top {n_plot} commentors for {iorg} in the last {n_days} days"
idata = idata.groupby('author', as_index=False).agg({'count': 'sum', 'authorAssociation': 'first'})
idata = idata.sort_values('count', ascending=False).head(n_plot)
ch = alt.Chart(data=idata.head(n_plot), title=title).mark_bar().encode(
x='author',
y='count',
color=alt.Color('authorAssociation', scale=alt.Scale(domain=author_types, range=author_colors))
)
charts.append(ch)
alt.hconcat(*charts)
First responders#
First responders are the first people to respond to a new issue in one of the repositories. The following plots show first responders for recently-created issues.
n_plot = 50
title = f"Top {n_plot} first responders for {github_org} in the last {n_days} days"
idata = first_responder_counts.groupby('author', as_index=False).agg({'n_first_responses': 'sum', 'authorAssociation': 'first'})
idata = idata.sort_values('n_first_responses', ascending=False).head(n_plot)
ch = alt.Chart(data=idata.head(n_plot), title=title).mark_bar().encode(
x='author',
y='n_first_responses',
color=alt.Color('authorAssociation', scale=alt.Scale(domain=author_types, range=author_colors))
)
ch
Recent activity#
A list of merged PRs by project#
Below is a tabbed readout of recently-merged PRs. Check out the title to get an idea for what they implemented, and be sure to thank the PR author for their hard work!
tabs = widgets.Tab(children=[])
for ii, ((org, repo), imerged) in enumerate(merged.query("repo in @use_repos").groupby(['org', 'repo'])):
merged_by = {}
pr_by = {}
issue_md = []
issue_md.append(f"#### Closed PRs for repo: [{org}/{repo}](https://github.com/{github_org}/{repo})")
issue_md.append("")
issue_md.append(f"##### ")
for _, ipr in imerged.iterrows():
user_name = ipr['author']
user_url = author_url(user_name)
pr_number = ipr['number']
pr_html = ipr['url']
pr_title = ipr['title']
pr_closedby = ipr['mergedBy']
pr_closedby_url = f"https://github.com/{pr_closedby}"
if user_name not in pr_by:
pr_by[user_name] = 1
else:
pr_by[user_name] += 1
if pr_closedby not in merged_by:
merged_by[pr_closedby] = 1
else:
merged_by[pr_closedby] += 1
text = f"* [(#{pr_number})]({pr_html}): _{pr_title}_ by **[@{user_name}]({user_url})** merged by **[@{pr_closedby}]({pr_closedby_url})**"
issue_md.append(text)
issue_md.append('')
markdown_html = markdown('\n'.join(issue_md))
children = list(tabs.children)
children.append(HTML(markdown_html))
tabs.children = tuple(children)
tabs.set_title(ii, repo)
tabs
A list of recent issues#
Below is a list of issues with recent activity in each repository. If they seem of interest to you, click on their links and jump in to participate!
n_plot = 5
tabs = widgets.Tab(children=[])
for ii, (repo, i_issues) in enumerate(comment_counts.query("repo in @use_repos").groupby('repo')):
issue_md = []
issue_md.append("")
issue_md.append(f"##### [{github_org}/{repo}](https://github.com/{github_org}/{repo})")
top_issues = i_issues.sort_values('n_comments', ascending=False).head(n_plot)
top_issue_list = pd.merge(issues, top_issues, left_on=['org', 'repo', 'id'], right_on=['org', 'repo', 'id'])
for _, issue in top_issue_list.sort_values('n_comments', ascending=False).head(n_plot).iterrows():
user_name = issue['author']
user_url = author_url(user_name)
issue_number = issue['number']
issue_html = issue['url']
issue_title = issue['title']
text = f"* [(#{issue_number})]({issue_html}): _{issue_title}_ by **[@{user_name}]({user_url})**"
issue_md.append(text)
issue_md.append('')
md_html = HTML(markdown('\n'.join(issue_md)))
children = list(tabs.children)
children.append(HTML(markdown('\n'.join(issue_md))))
tabs.children = tuple(children)
tabs.set_title(ii, repo)
display(Markdown(f"Here are the top {n_plot} active issues in each repository in the last {n_days} days"))
display(tabs)
Here are the top 5 active issues in each repository in the last 90 days