executablebooks#

Activity from ‘2022-12-06’ to ‘2023-03-06’

Load data#

Load and clean up the data
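
The cells below assume names defined in a setup cell earlier in the full notebook (`pd`, `alt`, `np`, `widgets`, display helpers, and report parameters such as `github_org` and `n_days`). A minimal sketch of that setup, with placeholder values that are assumptions rather than the notebook’s actual configuration:

# Hypothetical setup cell; the real notebook defines these earlier.
import altair as alt
import numpy as np
import pandas as pd
import ipywidgets as widgets
from IPython.display import HTML, Markdown, display
from markdown import markdown

github_org = "executablebooks"  # organization covered by this report
n_days = 90                     # length of the activity window, in days
top_n_repos = 15                # placeholder: how many repos appear in the full-data plots

# Placeholder association categories and colors used by the charts below
author_types = ["MEMBER", "COLLABORATOR", "CONTRIBUTOR", "FIRST_TIME_CONTRIBUTOR", "NONE"]
author_colors = ["#4c78a8", "#72b7b2", "#f58518", "#e45756", "#54a24b"]

def author_url(username):
    """Return the GitHub profile URL for a username."""
    return f"https://github.com/{username}"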

from pathlib import Path

# Read the raw activity exports and drop any duplicated rows
path_data = Path("../data")
comments = pd.read_csv(path_data.joinpath('comments.csv'), index_col=None).drop_duplicates()
issues = pd.read_csv(path_data.joinpath('issues.csv'), index_col=None).drop_duplicates()
prs = pd.read_csv(path_data.joinpath('prs.csv'), index_col=None).drop_duplicates()

# Keep only rows belonging to the organization this report covers
for idata in [comments, issues, prs]:
    idata.query("org == @github_org", inplace=True)
# Identify the most-commented repositories; only these appear in the full-data plots
top_commented_repos = comments.groupby("repo").count().sort_values("createdAt", ascending=False)['createdAt']
use_repos = top_commented_repos.head(top_n_repos).index.tolist()

Merged Pull requests#

Here’s an analysis of merged pull requests across each of the repositories in the organization.
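
The `merged` frame is built earlier in the full notebook; a minimal sketch of one plausible derivation from `prs` (the `state` and `mergedAt` column names are assumptions):

# Hypothetical derivation: PRs merged during the activity window
prs = prs.assign(mergedAt=pd.to_datetime(prs["mergedAt"], utc=True))  # 'mergedAt' assumed
window_start = pd.Timestamp.now(tz="UTC") - pd.Timedelta(days=n_days)
merged = prs[(prs["state"] == "MERGED") & (prs["mergedAt"] >= window_start)]  # 'state' assumed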

prs_by_repo = merged.groupby(['org', 'repo']).count()['author'].reset_index().sort_values(['org', 'author'], ascending=False)
alt.Chart(data=prs_by_repo, title=f"Merged PRs in the last {n_days} days").mark_bar().encode(
    x=alt.X('repo', sort=prs_by_repo['repo'].values.tolist()),
    y='author',
    color='org'
)

Authoring and merging stats by repository#

Let’s see who has been doing most of the PR authoring and merging. The PR author is generally the person who implemented a change in the repository (code, documentation, etc.). The PR merger is the person who “pressed the green button” and got the change into the main codebase.
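
The `merged_by_repo` and `closed_by_repo` tables are computed earlier in the notebook; a plausible sketch, with the aggregation shape inferred from how the cells below use them:

# Hypothetical per-repo tallies of PR authors and mergers
merged_by_repo = (
    merged.groupby(['repo', 'author', 'authorAssociation'], as_index=False)
    .agg(authored=('number', 'count'))
    .rename(columns={'author': 'username'})
)
closed_by_repo = (
    merged.groupby(['repo', 'mergedBy'], as_index=False)
    .agg(closed=('number', 'count'))
    .rename(columns={'mergedBy': 'username'})
)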

title = f"PR authors for {github_org} in the last {n_days} days"
# Total PRs authored per person, keeping each person's association for the color scale
this_data = merged_by_repo.replace(np.nan, 0).groupby('username', as_index=False).agg({'authored': 'sum', 'authorAssociation': 'first'})
this_data = this_data.sort_values('authored', ascending=False)
ch = alt.Chart(data=this_data, title=title).mark_bar().encode(
    x='username',
    y='authored',
    color=alt.Color('authorAssociation', scale=alt.Scale(domain=author_types, range=author_colors))
)
ch
title = f"Merges for {github_org} in the last {n_days} days"
ch = alt.Chart(data=closed_by_repo.replace(np.nan, 0), title=title).mark_bar().encode(
    x='username',
    y='closed',
)
ch

Issues#

Issues are conversations that happen on our GitHub repositories. Here’s an analysis of issues across the organization’s repositories.
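
The `created` and `closed` frames come from earlier in the notebook; a minimal sketch of the idea, assuming ISO-format `createdAt`/`closedAt` columns on `issues`:

# Hypothetical split of issues into those created vs. closed during the window
issues = issues.assign(
    createdAt=pd.to_datetime(issues["createdAt"], utc=True),
    closedAt=pd.to_datetime(issues["closedAt"], utc=True),  # 'closedAt' assumed
)
window_start = pd.Timestamp.now(tz="UTC") - pd.Timedelta(days=n_days)
created = issues[issues["createdAt"] >= window_start]
closed = issues[issues["closedAt"] >= window_start].copy()
# Seconds each closed issue spent open; converted to days before plotting
closed["time_open"] = (closed["closedAt"] - closed["createdAt"]).dt.total_seconds()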

created_counts = created.groupby(['org', 'repo']).count()['number'].reset_index()
created_counts['org/repo'] = created_counts.apply(lambda a: a['org'] + '/' + a['repo'], axis=1)
sorted_vals = created_counts.sort_values(['org', 'number'], ascending=False)['repo'].values
alt.Chart(data=created_counts, title=f"Issues created in the last {n_days} days").mark_bar().encode(
    x=alt.X('repo', sort=alt.Sort(sorted_vals.tolist())),
    y='number',
)
closed_counts = closed.groupby(['org', 'repo']).count()['number'].reset_index()
closed_counts['org/repo'] = closed_counts.apply(lambda a: a['org'] + '/' + a['repo'], axis=1)
sorted_vals = closed_counts.sort_values(['number'], ascending=False)['repo'].values
alt.Chart(data=closed_counts, title=f"Issues closed in the last {n_days} days").mark_bar().encode(
    x=alt.X('repo', sort=alt.Sort(sorted_vals.tolist())),
    y='number',
)
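
The `created_closed` frame used below stacks the two count tables into long form for a faceted chart; a plausible sketch (an assumption about how the notebook builds it):

# Hypothetical long-form table: one row per (repo, kind) with an activity count
created_closed = pd.concat([
    created_counts.assign(kind="created"),
    closed_counts.assign(kind="closed"),
]).rename(columns={"number": "count"})[["repo", "kind", "count"]]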
# Pick the top 10 repositories by combined created/closed issue activity
top_repos = created_closed.groupby(['repo']).sum().sort_values(by='count', ascending=False).head(10).index
ch = alt.Chart(created_closed.query('repo in @top_repos'), width=120).mark_bar().encode(
    x=alt.X("kind", axis=alt.Axis(labelFontSize=15, title="")), 
    y=alt.Y('count', axis=alt.Axis(titleFontSize=15, labelFontSize=12)),
    color='kind',
    column=alt.Column("repo", header=alt.Header(title=f"Issue activity, last {n_days} days for {github_org}", titleFontSize=15, labelFontSize=12))
)
ch
time_open = closed.groupby(['org', 'repo']).agg({'time_open': 'median'}).reset_index()
# Convert seconds to days
time_open['time_open'] = time_open['time_open'] / (60 * 60 * 24)
time_open['org/repo'] = time_open.apply(lambda a: a['org'] + '/' + a['repo'], axis=1)
sorted_vals = time_open.sort_values(['org', 'time_open'], ascending=False)['repo'].values
alt.Chart(data=time_open, title=f"Time to close for issues closed in the last {n_days} days").mark_bar().encode(
    x=alt.X('repo', sort=alt.Sort(sorted_vals.tolist())),
    y=alt.Y('time_open', title="Median Days Open"),
)

Most-upvoted issues#

# The 25 issues with the most 👍 reactions, keeping the columns needed for the list below
thumbsup = issues.sort_values("thumbsup", ascending=False).head(25)
thumbsup = thumbsup[["title", "url", "number", "thumbsup", "repo"]]

text = []
for ii, irow in thumbsup.iterrows():
    itext = f"- ({irow['thumbsup']}) {irow['title']} - {irow['repo']} - [#{irow['number']}]({irow['url']})"
    text.append(itext)
text = '\n'.join(text)
HTML(markdown(text))

Commenters across repositories#

These are the people who commented across all issues and pull requests in the last several days, colored by the commenter’s association with the organization. For information about what these associations mean, see this StackOverflow post.
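
The `commentors` table (spelled as in the notebook) is computed earlier; a minimal sketch under assumed column names:

# Hypothetical comment tallies per (org, author)
commentors = (
    comments.groupby(['org', 'author', 'authorAssociation'], as_index=False)
    .agg(count=('createdAt', 'count'))
)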

n_plot = 50
charts = []
for ii, (iorg, idata) in enumerate(commentors.groupby('org')):
    title = f"Top {n_plot} commenters for {iorg} in the last {n_days} days"
    idata = idata.groupby('author', as_index=False).agg({'count': 'sum', 'authorAssociation': 'first'})
    idata = idata.sort_values('count', ascending=False).head(n_plot)
    ch = alt.Chart(data=idata, title=title).mark_bar().encode(
        x='author',
        y='count',
        color=alt.Color('authorAssociation', scale=alt.Scale(domain=author_types, range=author_colors))
    )
    charts.append(ch)
alt.hconcat(*charts)

First responders#

First responders are the first people to respond to a new issue in one of the repositories. The following plots show first responders for recently-created issues.
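
The `first_responder_counts` table is produced earlier in the notebook; a minimal sketch of the idea, assuming each comment row records the URL of the issue it belongs to (`issue_url` is a hypothetical column):

# Hypothetical: keep only the earliest comment on each issue, then count per author
first_comments = (
    comments.sort_values('createdAt')
    .groupby('issue_url', as_index=False)  # 'issue_url' is an assumed column
    .first()
)
first_responder_counts = (
    first_comments.groupby(['author', 'authorAssociation'], as_index=False)
    .agg(n_first_responses=('issue_url', 'count'))
)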

n_plot = 50

title = f"Top {n_plot} first responders for {github_org} in the last {n_days} days"
idata = first_responder_counts.groupby('author', as_index=False).agg({'n_first_responses': 'sum', 'authorAssociation': 'first'})
idata = idata.sort_values('n_first_responses', ascending=False).head(n_plot)
ch = alt.Chart(data=idata, title=title).mark_bar().encode(
    x='author',
    y='n_first_responses',
    color=alt.Color('authorAssociation', scale=alt.Scale(domain=author_types, range=author_colors))
)
ch

Recent activity#

A list of merged PRs by project#

Below is a tabbed readout of recently-merged PRs. Check out the titles to get an idea of what they implemented, and be sure to thank the PR authors for their hard work!

tabs = widgets.Tab(children=[])

for ii, ((org, repo), imerged) in enumerate(merged.query("repo in @use_repos").groupby(['org', 'repo'])):
    # Tally how many PRs each person authored and merged in this repo
    merged_by = {}
    pr_by = {}
    issue_md = []
    issue_md.append(f"#### Closed PRs for repo: [{org}/{repo}](https://github.com/{org}/{repo})")
    issue_md.append("")

    for _, ipr in imerged.iterrows():
        user_name = ipr['author']
        user_url = author_url(user_name)
        pr_number = ipr['number']
        pr_html = ipr['url']
        pr_title = ipr['title']
        pr_closedby = ipr['mergedBy']
        pr_closedby_url = f"https://github.com/{pr_closedby}"
        pr_by[user_name] = pr_by.get(user_name, 0) + 1
        merged_by[pr_closedby] = merged_by.get(pr_closedby, 0) + 1
        text = f"* [(#{pr_number})]({pr_html}): _{pr_title}_ by **[@{user_name}]({user_url})** merged by **[@{pr_closedby}]({pr_closedby_url})**"
        issue_md.append(text)
    
    issue_md.append('')
    markdown_html = markdown('\n'.join(issue_md))

    children = list(tabs.children)
    children.append(HTML(markdown_html))
    tabs.children = tuple(children)
    tabs.set_title(ii, repo)
tabs

A list of recent issues#

Below is a list of issues with recent activity in each repository. If they seem of interest to you, click on their links and jump in to participate!
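
The `comment_counts` table is assumed to be computed earlier in the notebook; a minimal sketch, assuming each comment row carries the `id` of its parent issue (an assumption inferred from the merge keys below):

# Hypothetical: number of recent comments per issue
comment_counts = (
    comments.groupby(['org', 'repo', 'id'], as_index=False)
    .agg(n_comments=('createdAt', 'count'))
)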

n_plot = 5
tabs = widgets.Tab(children=[])

for ii, (repo, i_issues) in enumerate(comment_counts.query("repo in @use_repos").groupby('repo')):
    
    issue_md = []
    issue_md.append("")
    issue_md.append(f"##### [{github_org}/{repo}](https://github.com/{github_org}/{repo})")

    top_issues = i_issues.sort_values('n_comments', ascending=False).head(n_plot)
    top_issue_list = pd.merge(issues, top_issues, left_on=['org', 'repo', 'id'], right_on=['org', 'repo', 'id'])
    for _, issue in top_issue_list.sort_values('n_comments', ascending=False).head(n_plot).iterrows():
        user_name = issue['author']
        user_url = author_url(user_name)
        issue_number = issue['number']
        issue_html = issue['url']
        issue_title = issue['title']

        text = f"* [(#{issue_number})]({issue_html}): _{issue_title}_ by **[@{user_name}]({user_url})**"
        issue_md.append(text)

    issue_md.append('')
    md_html = HTML(markdown('\n'.join(issue_md)))

    children = list(tabs.children)
    children.append(md_html)
    tabs.children = tuple(children)
    tabs.set_title(ii, repo)
    
display(Markdown(f"Here are the top {n_plot} active issues in each repository in the last {n_days} days"))
display(tabs)

Here are the top 5 active issues in each repository in the last 90 days