executablebooks#

Activity from ‘2022-12-06’ to ‘2023-03-06’

Load data#

Load and clean up the data
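
The cells below assume names defined in a setup cell earlier in the full notebook (`pd`, `alt`, `np`, `widgets`, display helpers, and report parameters such as `github_org` and `n_days`). A minimal sketch of that setup, with placeholder values that are assumptions rather than the notebook’s actual configuration:

# Hypothetical setup cell; the real notebook defines these earlier.
import altair as alt
import numpy as np
import pandas as pd
import ipywidgets as widgets
from IPython.display import HTML, Markdown, display
from markdown import markdown

github_org = "executablebooks"  # organization covered by this report
n_days = 90                     # length of the activity window, in days
top_n_repos = 15                # placeholder: how many repos appear in the full-data plots

# Placeholder association categories and colors used by the charts below
author_types = ["MEMBER", "COLLABORATOR", "CONTRIBUTOR", "FIRST_TIME_CONTRIBUTOR", "NONE"]
author_colors = ["#4c78a8", "#72b7b2", "#f58518", "#e45756", "#54a24b"]

def author_url(username):
    """Return the GitHub profile URL for a username."""
    return f"https://github.com/{username}"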

from pathlib import Path

# Read the raw activity exports and drop any duplicated rows
path_data = Path("../data")
comments = pd.read_csv(path_data.joinpath('comments.csv'), index_col=None).drop_duplicates()
issues = pd.read_csv(path_data.joinpath('issues.csv'), index_col=None).drop_duplicates()
prs = pd.read_csv(path_data.joinpath('prs.csv'), index_col=None).drop_duplicates()

# Keep only rows belonging to the organization this report covers
for idata in [comments, issues, prs]:
    idata.query("org == @github_org", inplace=True)
# Identify the most-commented repositories; only these appear in the full-data plots
top_commented_repos = comments.groupby("repo").count().sort_values("createdAt", ascending=False)['createdAt']
use_repos = top_commented_repos.head(top_n_repos).index.tolist()

Merged Pull requests#

Here’s an analysis of merged pull requests across each of the repositories in the organization.
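
The `merged` frame is built earlier in the full notebook; a minimal sketch of one plausible derivation from `prs` (the `state` and `mergedAt` column names are assumptions):

# Hypothetical derivation: PRs merged during the activity window
prs = prs.assign(mergedAt=pd.to_datetime(prs["mergedAt"], utc=True))  # 'mergedAt' assumed
window_start = pd.Timestamp.now(tz="UTC") - pd.Timedelta(days=n_days)
merged = prs[(prs["state"] == "MERGED") & (prs["mergedAt"] >= window_start)]  # 'state' assumed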

prs_by_repo = merged.groupby(['org', 'repo']).count()['author'].reset_index().sort_values(['org', 'author'], ascending=False)
alt.Chart(data=prs_by_repo, title=f"Merged PRs in the last {n_days} days").mark_bar().encode(
    x=alt.X('repo', sort=prs_by_repo['repo'].values.tolist()),
    y='author',
    color='org'
)

Authoring and merging stats by repository#

Let’s see who has been doing most of the PR authoring and merging. The PR author is generally the person who implemented a change in the repository (code, documentation, etc.). The PR merger is the person who “pressed the green button” and got the change into the main codebase.
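
The `merged_by_repo` and `closed_by_repo` tables are computed earlier in the notebook; a plausible sketch, with the aggregation shape inferred from how the cells below use them:

# Hypothetical per-repo tallies of PR authors and mergers
merged_by_repo = (
    merged.groupby(['repo', 'author', 'authorAssociation'], as_index=False)
    .agg(authored=('number', 'count'))
    .rename(columns={'author': 'username'})
)
closed_by_repo = (
    merged.groupby(['repo', 'mergedBy'], as_index=False)
    .agg(closed=('number', 'count'))
    .rename(columns={'mergedBy': 'username'})
)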

title = f"PR authors for {github_org} in the last {n_days} days"
# Total PRs authored per person, keeping each person's association for the color scale
this_data = merged_by_repo.replace(np.nan, 0).groupby('username', as_index=False).agg({'authored': 'sum', 'authorAssociation': 'first'})
this_data = this_data.sort_values('authored', ascending=False)
ch = alt.Chart(data=this_data, title=title).mark_bar().encode(
    x='username',
    y='authored',
    color=alt.Color('authorAssociation', scale=alt.Scale(domain=author_types, range=author_colors))
)
ch
title = f"Merges for {github_org} in the last {n_days} days"
ch = alt.Chart(data=closed_by_repo.replace(np.nan, 0), title=title).mark_bar().encode(
    x='username',
    y='closed',
)
ch

Issues#

Issues are conversations that happen on our GitHub repositories. Here’s an analysis of issues across the organization’s repositories.
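
The `created` and `closed` frames come from earlier in the notebook; a minimal sketch of the idea, assuming ISO-format `createdAt`/`closedAt` columns on `issues`:

# Hypothetical split of issues into those created vs. closed during the window
issues = issues.assign(
    createdAt=pd.to_datetime(issues["createdAt"], utc=True),
    closedAt=pd.to_datetime(issues["closedAt"], utc=True),  # 'closedAt' assumed
)
window_start = pd.Timestamp.now(tz="UTC") - pd.Timedelta(days=n_days)
created = issues[issues["createdAt"] >= window_start]
closed = issues[issues["closedAt"] >= window_start].copy()
# Seconds each closed issue spent open; converted to days before plotting
closed["time_open"] = (closed["closedAt"] - closed["createdAt"]).dt.total_seconds()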

created_counts = created.groupby(['org', 'repo']).count()['number'].reset_index()
created_counts['org/repo'] = created_counts.apply(lambda a: a['org'] + '/' + a['repo'], axis=1)
sorted_vals = created_counts.sort_values(['org', 'number'], ascending=False)['repo'].values
alt.Chart(data=created_counts, title=f"Issues created in the last {n_days} days").mark_bar().encode(
    x=alt.X('repo', sort=alt.Sort(sorted_vals.tolist())),
    y='number',
)
closed_counts = closed.groupby(['org', 'repo']).count()['number'].reset_index()
closed_counts['org/repo'] = closed_counts.apply(lambda a: a['org'] + '/' + a['repo'], axis=1)
sorted_vals = closed_counts.sort_values(['number'], ascending=False)['repo'].values
alt.Chart(data=closed_counts, title=f"Issues closed in the last {n_days} days").mark_bar().encode(
    x=alt.X('repo', sort=alt.Sort(sorted_vals.tolist())),
    y='number',
)
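
The `created_closed` frame used below stacks the two count tables into long form for a faceted chart; a plausible sketch (an assumption about how the notebook builds it):

# Hypothetical long-form table: one row per (repo, kind) with an activity count
created_closed = pd.concat([
    created_counts.assign(kind="created"),
    closed_counts.assign(kind="closed"),
]).rename(columns={"number": "count"})[["repo", "kind", "count"]]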
# Pick the top 10 repositories by combined created/closed issue activity
top_repos = created_closed.groupby(['repo']).sum().sort_values(by='count', ascending=False).head(10).index
ch = alt.Chart(created_closed.query('repo in @top_repos'), width=120).mark_bar().encode(
    x=alt.X("kind", axis=alt.Axis(labelFontSize=15, title="")), 
    y=alt.Y('count', axis=alt.Axis(titleFontSize=15, labelFontSize=12)),
    color='kind',
    column=alt.Column("repo", header=alt.Header(title=f"Issue activity, last {n_days} days for {github_org}", titleFontSize=15, labelFontSize=12))
)
ch
time_open = closed.groupby(['org', 'repo']).agg({'time_open': 'median'}).reset_index()
# Convert seconds to days
time_open['time_open'] = time_open['time_open'] / (60 * 60 * 24)
time_open['org/repo'] = time_open.apply(lambda a: a['org'] + '/' + a['repo'], axis=1)
sorted_vals = time_open.sort_values(['org', 'time_open'], ascending=False)['repo'].values
alt.Chart(data=time_open, title=f"Time to close for issues closed in the last {n_days} days").mark_bar().encode(
    x=alt.X('repo', sort=alt.Sort(sorted_vals.tolist())),
    y=alt.Y('time_open', title="Median Days Open"),
)

Most-upvoted issues#

# The 25 issues with the most 👍 reactions, keeping the columns needed for the list below
thumbsup = issues.sort_values("thumbsup", ascending=False).head(25)
thumbsup = thumbsup[["title", "url", "number", "thumbsup", "repo"]]

text = []
for ii, irow in thumbsup.iterrows():
    itext = f"- ({irow['thumbsup']}) {irow['title']} - {irow['repo']} - [#{irow['number']}]({irow['url']})"
    text.append(itext)
text = '\n'.join(text)
HTML(markdown(text))

Commenters across repositories#

These are the people who commented across all issues and pull requests in the last several days, colored by the commenter’s association with the organization. For information about what these associations mean, see this StackOverflow post.
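
The `commentors` table (spelled as in the notebook) is computed earlier; a minimal sketch under assumed column names:

# Hypothetical comment tallies per (org, author)
commentors = (
    comments.groupby(['org', 'author', 'authorAssociation'], as_index=False)
    .agg(count=('createdAt', 'count'))
)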

n_plot = 50
charts = []
for ii, (iorg, idata) in enumerate(commentors.groupby('org')):
    title = f"Top {n_plot} commenters for {iorg} in the last {n_days} days"
    idata = idata.groupby('author', as_index=False).agg({'count': 'sum', 'authorAssociation': 'first'})
    idata = idata.sort_values('count', ascending=False).head(n_plot)
    ch = alt.Chart(data=idata, title=title).mark_bar().encode(
        x='author',
        y='count',
        color=alt.Color('authorAssociation', scale=alt.Scale(domain=author_types, range=author_colors))
    )
    charts.append(ch)
alt.hconcat(*charts)

First responders#

First responders are the first people to respond to a new issue in one of the repositories. The following plots show first responders for recently-created issues.
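
The `first_responder_counts` table is produced earlier in the notebook; a minimal sketch of the idea, assuming each comment row records the URL of the issue it belongs to (`issue_url` is a hypothetical column):

# Hypothetical: keep only the earliest comment on each issue, then count per author
first_comments = (
    comments.sort_values('createdAt')
    .groupby('issue_url', as_index=False)  # 'issue_url' is an assumed column
    .first()
)
first_responder_counts = (
    first_comments.groupby(['author', 'authorAssociation'], as_index=False)
    .agg(n_first_responses=('issue_url', 'count'))
)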

n_plot = 50

title = f"Top {n_plot} first responders for {github_org} in the last {n_days} days"
idata = first_responder_counts.groupby('author', as_index=False).agg({'n_first_responses': 'sum', 'authorAssociation': 'first'})
idata = idata.sort_values('n_first_responses', ascending=False).head(n_plot)
ch = alt.Chart(data=idata, title=title).mark_bar().encode(
    x='author',
    y='n_first_responses',
    color=alt.Color('authorAssociation', scale=alt.Scale(domain=author_types, range=author_colors))
)
ch

Recent activity#

A list of merged PRs by project#

Below is a tabbed readout of recently-merged PRs. Check out the titles to get an idea of what they implemented, and be sure to thank the PR authors for their hard work!

tabs = widgets.Tab(children=[])

for ii, ((org, repo), imerged) in enumerate(merged.query("repo in @use_repos").groupby(['org', 'repo'])):
    # Tally how many PRs each person authored and merged in this repo
    merged_by = {}
    pr_by = {}
    issue_md = []
    issue_md.append(f"#### Closed PRs for repo: [{org}/{repo}](https://github.com/{org}/{repo})")
    issue_md.append("")

    for _, ipr in imerged.iterrows():
        user_name = ipr['author']
        user_url = author_url(user_name)
        pr_number = ipr['number']
        pr_html = ipr['url']
        pr_title = ipr['title']
        pr_closedby = ipr['mergedBy']
        pr_closedby_url = f"https://github.com/{pr_closedby}"
        pr_by[user_name] = pr_by.get(user_name, 0) + 1
        merged_by[pr_closedby] = merged_by.get(pr_closedby, 0) + 1
        text = f"* [(#{pr_number})]({pr_html}): _{pr_title}_ by **[@{user_name}]({user_url})** merged by **[@{pr_closedby}]({pr_closedby_url})**"
        issue_md.append(text)
    
    issue_md.append('')
    markdown_html = markdown('\n'.join(issue_md))

    children = list(tabs.children)
    children.append(HTML(markdown_html))
    tabs.children = tuple(children)
    tabs.set_title(ii, repo)
tabs

A list of recent issues#

Below is a list of issues with recent activity in each repository. If they seem of interest to you, click on their links and jump in to participate!
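
The `comment_counts` table is assumed to be computed earlier in the notebook; a minimal sketch, assuming each comment row carries the `id` of its parent issue (an assumption inferred from the merge keys below):

# Hypothetical: number of recent comments per issue
comment_counts = (
    comments.groupby(['org', 'repo', 'id'], as_index=False)
    .agg(n_comments=('createdAt', 'count'))
)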

n_plot = 5
tabs = widgets.Tab(children=[])

for ii, (repo, i_issues) in enumerate(comment_counts.query("repo in @use_repos").groupby('repo')):
    
    issue_md = []
    issue_md.append("")
    issue_md.append(f"##### [{github_org}/{repo}](https://github.com/{github_org}/{repo})")

    top_issues = i_issues.sort_values('n_comments', ascending=False).head(n_plot)
    top_issue_list = pd.merge(issues, top_issues, left_on=['org', 'repo', 'id'], right_on=['org', 'repo', 'id'])
    for _, issue in top_issue_list.sort_values('n_comments', ascending=False).head(n_plot).iterrows():
        user_name = issue['author']
        user_url = author_url(user_name)
        issue_number = issue['number']
        issue_html = issue['url']
        issue_title = issue['title']

        text = f"* [(#{issue_number})]({issue_html}): _{issue_title}_ by **[@{user_name}]({user_url})**"
        issue_md.append(text)

    issue_md.append('')
    md_html = HTML(markdown('\n'.join(issue_md)))

    children = list(tabs.children)
    children.append(md_html)
    tabs.children = tuple(children)
    tabs.set_title(ii, repo)
    
display(Markdown(f"Here are the top {n_plot} active issues in each repository in the last {n_days} days"))
display(tabs)

Here are the top 5 active issues in each repository in the last 90 days