jupyterhub#
Activity from ‘2025-04-08’ to ‘2025-07-07’
Load data#
Load and clean up the data
Merged Pull requests#
Here’s an analysis of merged pull requests across each of the repositories in the Jupyter ecosystem.
# Merged-PR count per repository (non-null `login` values per org/repo group),
# ordered by org and then by count, both descending.
pr_counts = merged.groupby(['org', 'repo']).count()['login'].reset_index()
prs_by_repo = pr_counts.sort_values(['org', 'login'], ascending=False)

# Hand the explicit repo order to the x encoding so the bars follow the sort above.
repo_order = prs_by_repo['repo'].values.tolist()
merged_bars = alt.Chart(data=prs_by_repo, title=f"Merged PRs in the last {n_days} days").mark_bar()
merged_bars.encode(
    x=alt.X('repo', sort=repo_order),
    y='login',
    color='org',
)
Issues#
Issues are conversations that happen on our GitHub repositories. Here’s an analysis of issues across the Jupyter organizations.
# Number of created issues per repository (non-null `number` values per group).
created_counts = created.groupby(['org', 'repo']).count()['number'].reset_index()

# Build the "org/repo" label column-wise rather than with a row-wise
# apply(axis=1): on a frame with no rows, apply can return a DataFrame instead
# of a Series, which breaks this assignment when no issues were created.
created_counts['org/repo'] = created_counts['org'].astype(str) + '/' + created_counts['repo'].astype(str)

# Bar order: org, then issue count, both descending.
sorted_vals = created_counts.sort_values(['org', 'number'], ascending=False)['repo'].values
alt.Chart(data=created_counts, title=f"Issues created in the last {n_days} days").mark_bar().encode(
    x=alt.X('repo', sort=alt.Sort(sorted_vals.tolist())),
    y='number',
)
# Number of closed issues per repository (non-null `number` values per group).
closed_counts = closed.groupby(['org', 'repo']).count()['number'].reset_index()

# Build the "org/repo" label column-wise rather than with a row-wise
# apply(axis=1): on a frame with no rows, apply can return a DataFrame instead
# of a Series, which breaks this assignment when no issues were closed.
closed_counts['org/repo'] = closed_counts['org'].astype(str) + '/' + closed_counts['repo'].astype(str)

# Bar order: issue count, descending (org is not part of the sort key here).
sorted_vals = closed_counts.sort_values(['number'], ascending=False)['repo'].values
alt.Chart(data=closed_counts, title=f"Issues closed in the last {n_days} days").mark_bar().encode(
    x=alt.X('repo', sort=alt.Sort(sorted_vals.tolist())),
    y='number',
)
charts = []

# Pick the top 10 repositories by total `count`. Only the `count` column is
# summed: aggregating the whole frame also tries to sum the non-numeric
# columns, which pandas >= 2.0 no longer drops silently.
repo_totals = created_closed.groupby('repo')['count'].sum()
top_repos = repo_totals.sort_values(ascending=False).head(10).index

# One small bar panel per repository (faceted by column), one bar per `kind`.
ch = alt.Chart(created_closed.query('repo in @top_repos'), width=120).mark_bar().encode(
    x=alt.X("kind", axis=alt.Axis(labelFontSize=15, title="")),
    y=alt.Y('count', axis=alt.Axis(titleFontSize=15, labelFontSize=12)),
    color='kind',
    column=alt.Column(
        "repo",
        header=alt.Header(
            title=f"Issue activity, last {n_days} days for {github_org}",
            titleFontSize=15,
            labelFontSize=12,
        ),
    ),
)
ch
# Median `time_open` of closed issues, per repository.
time_open = closed.groupby(['org', 'repo']).agg({'time_open': 'median'}).reset_index()

# Divide by the number of seconds in a day; the chart labels the result
# "Median Days Open" (assumes `time_open` is stored in seconds - TODO confirm).
time_open['time_open'] = time_open['time_open'] / (60 * 60 * 24)

# Build the "org/repo" label column-wise rather than with a row-wise
# apply(axis=1): on a frame with no rows, apply can return a DataFrame instead
# of a Series, which breaks this assignment when no issues were closed.
time_open['org/repo'] = time_open['org'].astype(str) + '/' + time_open['repo'].astype(str)

# Bar order: org, then median time open, both descending.
sorted_vals = time_open.sort_values(['org', 'time_open'], ascending=False)['repo'].values
alt.Chart(data=time_open, title=f"Time to close for issues closed in the last {n_days} days").mark_bar().encode(
    x=alt.X('repo', sort=alt.Sort(sorted_vals.tolist())),
    y=alt.Y('time_open', title="Median Days Open"),
)
Most-upvoted issues#
# The 25 open issues with the highest `positive` value, trimmed to the columns
# needed for the rendered list.
open_issues = issues.query("state == 'open'")
thumbsup = open_issues.sort_values("positive", ascending=False).head(25)
thumbsup = thumbsup[["title", "url", "number", "positive", "repo"]]

# One Markdown bullet per issue: "(positive) title - repo - [#number](url)".
text = '\n'.join(
    f"- ({irow['positive']}) {irow['title']} - {irow['repo']} - [#{irow['number']}]({irow['url']})"
    for _, irow in thumbsup.iterrows()
)
HTML(markdown(text))
Commenters across repositories#
These are the most active commenters across all issues and pull requests during the reporting period. Bars are colored by each commenter’s association with the organization. For information about what these associations mean, see this StackOverflow post.
n_plot = 50
charts = []

# Group by the bare column name, not a one-element list: with a list key,
# pandas >= 2.0 yields 1-tuples as group keys, so the title would read
# "... for ('jupyterhub',) ..." instead of "... for jupyterhub ...".
for iorg, idata in commentors.groupby('org'):
    title = f"Top {n_plot} commentors for {iorg} in the last {n_days} days"

    # Total comments per login; keep the first association seen for that login.
    idata = idata.groupby('login', as_index=False).agg({'count': 'sum', 'author_association': 'first'})
    idata = idata.sort_values('count', ascending=False).head(n_plot)

    ch = alt.Chart(data=idata, title=title).mark_bar().encode(
        x='login',
        y='count',
        color=alt.Color('author_association', scale=alt.Scale(domain=author_types, range=author_colors)),
    )
    charts.append(ch)

# One chart per org, laid out side by side.
alt.hconcat(*charts)
First responders#
First responders are the first people to respond to a new issue in one of the repositories. The following plots show first responders for recently-created issues.
n_plot = 50
title = f"Top {n_plot} first responders for {github_org} in the last {n_days} days"

# Total first responses per login; keep the first association seen for that login.
responder_totals = first_responder_counts.groupby('login', as_index=False).agg(
    {'n_first_responses': 'sum', 'author_association': 'first'}
)
idata = responder_totals.sort_values('n_first_responses', ascending=False).head(n_plot)

# Bars are colored by association, using the author_types / author_colors mapping.
association_scale = alt.Scale(domain=author_types, range=author_colors)
ch = alt.Chart(data=idata, title=title).mark_bar().encode(
    x='login',
    y='n_first_responses',
    color=alt.Color('author_association', scale=association_scale),
)
ch
Recent activity#
A list of merged PRs by project#
Below is a tabbed readout of recently-merged PRs. Check out the title to get an idea of what they implemented, and be sure to thank the PR author for their hard work!
# Build one tab per (org, repo) listing its merged PRs as a Markdown bullet list.
tabs = widgets.Tab(children=[])

# Collect the rendered panels and their titles first and attach them to the
# widget once, instead of rebuilding `tabs.children` on every iteration.
tab_children = []
tab_titles = []

for (org, repo), imerged in merged.query("repo in @use_repos").groupby(['org', 'repo']):
    # Per-repo tallies of PR authors and of mergers. Nothing in this cell reads
    # them; they are kept in case later cells do.
    merged_by = {}
    pr_by = {}

    # The header link uses the group's own `org`, matching the label shown.
    issue_md = [
        f"#### Closed PRs for repo: [{org}/{repo}](https://github.com/{org}/{repo})",
        "",
        "##### ",
    ]

    for _, ipr in imerged.iterrows():
        user_name = ipr['login']
        user_url = author_url(user_name)
        pr_number = ipr['number']
        pr_html = ipr['url']
        pr_title = ipr['title']
        pr_closedby = ipr['merged_by']
        pr_closedby_url = f"https://github.com/{pr_closedby}"

        pr_by[user_name] = pr_by.get(user_name, 0) + 1
        merged_by[pr_closedby] = merged_by.get(pr_closedby, 0) + 1

        text = f"* [(#{pr_number})]({pr_html}): _{pr_title}_ by **[@{user_name}]({user_url})** merged by **[@{pr_closedby}]({pr_closedby_url})**"
        issue_md.append(text)

    issue_md.append('')
    markdown_html = markdown('\n'.join(issue_md))
    tab_children.append(HTML(markdown_html))
    tab_titles.append(repo)

tabs.children = tuple(tab_children)
for tab_index, tab_title in enumerate(tab_titles):
    tabs.set_title(tab_index, tab_title)
tabs
A list of recent issues#
Below is a list of issues with recent activity in each repository. If they seem of interest to you, click on their links and jump in to participate!
# Build one tab per repository listing its most-commented issues.
n_plot = 5
tabs = widgets.Tab(children=[])

# Collect the rendered panels and their titles first and attach them to the
# widget once, instead of rebuilding `tabs.children` on every iteration.
tab_children = []
tab_titles = []

for repo, i_issues in comment_counts.query("repo in @use_repos").groupby('repo'):
    issue_md = [
        "",
        f"##### [{github_org}/{repo}](https://github.com/{github_org}/{repo})",
    ]

    # Take the n_plot issues with the most comments, then join them against
    # `issues` to pick up the title, url, number and author.
    top_issues = i_issues.sort_values('n_comments', ascending=False).head(n_plot)
    top_issue_list = pd.merge(issues, top_issues, on=['org', 'repo', 'id_issue'])

    # Re-sort and re-trim after the merge, whose row order and row count are
    # not guaranteed to match `top_issues`.
    for _, issue in top_issue_list.sort_values('n_comments', ascending=False).head(n_plot).iterrows():
        user_name = issue['login']
        user_url = author_url(user_name)
        issue_number = issue['number']
        issue_html = issue['url']
        issue_title = issue['title']

        text = f"* [(#{issue_number})]({issue_html}): _{issue_title}_ by **[@{user_name}]({user_url})**"
        issue_md.append(text)

    issue_md.append('')

    # Render the Markdown once and reuse the resulting widget.
    md_html = HTML(markdown('\n'.join(issue_md)))
    tab_children.append(md_html)
    tab_titles.append(repo)

tabs.children = tuple(tab_children)
for tab_index, tab_title in enumerate(tab_titles):
    tabs.set_title(tab_index, tab_title)

display(Markdown(f"Here are the top {n_plot} active issues in each repository in the last {n_days} days"))
display(tabs)
Here are the top 5 active issues in each repository in the last 90 days