# GitHub Repo Report and Analysis

```elixir
Mix.install(
  [
    {:req, "~> 0.4.0"},
    {:json, "~> 1.4"},
    {:instructor, github: "acalejos/instructor_ex"},
    {:exterval, "~> 0.2.0"},
    {:kino, "~> 0.12.3"}
  ],
  config: [
    instructor: [
      adapter: Instructor.Adapters.OpenAI
    ],
    openai: [http_options: [recv_timeout: 10 * 60 * 1000]]
  ]
)
```

## Github Searching

```elixir
# Structured output schema for the generated report. It is passed as the
# `response_model` to `Instructor.chat_completion/1` in `ReportGenerator.get_report/3`.
defmodule Report do
  use Ecto.Schema
  use Instructor.Validator

  import Ecto.Changeset
  import Exterval

  # Both scores must lie in [0, 10] in steps of 0.5.
  @score_interval ~i<[0,10]//0.5>

  @categories [
    :software_library,
    :list_aggregation,
    :educational_tutorial,
    :documentation_reference,
    :configuration_dotfiles,
    :template_boilerplate,
    :data_dataset,
    :tool_utility,
    :art_design,
    :hardware_iot,
    :research_academic,
    :demo_example,
    :community_social,
    :game_development
  ]

  @doc """
  ## Field Descriptions:
  - repository_details: A summary of the repository details, including facts such as name, owner, descriptions, number of stargazers, licenseInfo, createdAt, pushedAt, isArchived, primaryLanguage, topics, etc. Your answer should be between 750 to 1000 characters
  - category: Categorizes the nature and purpose of the GitHub repository. It aids in quickly identifying the type of content or project a repository contains. This field provides vital information for someone determining the relevance and scope of a project. The field accepts predefined atom values representing various types of repositories typically found on GitHub. These categories include:
    - `:software_library` - Traditional coding projects like applications, frameworks, or libraries.
    - `:list_aggregation` - Collections of resources such as awesome lists or curated topic lists.
    - `:educational_tutorial` - Repositories focusing on educational content, tutorials, or learning exercises.
    - `:documentation_reference` - Projects mainly containing documentation, reference material, or specifications.
    - `:configuration_dotfiles` - Repositories for configuration files or dotfiles for software/tool setups.
    - `:template_boilerplate` - Starter templates or boilerplates for kickstarting development.
    - `:data_dataset` - Repos hosting or linking to datasets for research or analysis.
    - `:tool_utility` - Repos containing specific tools, scripts, or utilities.
    - `:art_design` - Projects related to art, graphics, or design resources.
    - `:hardware_iot` - Repositories about hardware, IoT, or physical computing projects.
    - `:research_academic` - Academic or scholarly research, papers, or experiments.
    - `:demo_example` - Example or demo projects illustrating concepts or technologies.
    - `:community_social` - Repos focused on community building or social initiatives.
    - `:game_development` - Game development-related projects, engines, or games.
  - language: The detected natural language (eg. English, Spanish, etc.) of the repository according to any natural language inputs found from fields such as the README, Git Commit Messages, Issue Messages, Repo Description, etc. The description and README should be weighted higher than all other sources.
  - activity_metrics: A summary of the repository's activity based on factors such as commitComments, issues (history, activity, comments, etc.), pull requests, etc. Your answer should be between 750 to 1000 characters.
  - community_involvement: A summary of the community involvement based on factors such as forkCount, watchers, contributors, discussions, mergeRequest discussions, etc. Your answer should be between 750 to 1000 characters.
  - development_practices: A summary of the development practices seen throughout the repo based on factors such as branch_protection_rules, latest_release_info, and other version-control meta controls. Your answer should be between 750 to 1000 characters.
  - skill_score: Rating from 0 to 10 in increments of 0.5 of how much expertise a new maintainer should have to consider taking on the development related to this repo. 0 is very easy (think Awesome lists which just aggregate other repos, lists, etc.), while 10 is extremely difficult (think something like an OS kernel).
  - potential_score: Rating from 0 to 10 in increments of 0.5 of how suitable the repo is for continued development if it were to be re-opened by a current user. Score the repo on its future development potential.
  - conclusion: Summarize your analysis of this repository with respect to its technical applicability, potential for future work, and skill required to work with it. Your conclusion should incorporate aspect of every other field, and should serve as a comprehensive summarization of the repository, its community, and its practices. Your answer should be between 1250 to 1500 characters
  """
  @primary_key false
  embedded_schema do
    field(:repository_details, :string)
    field(:category, Ecto.Enum, values: @categories)
    field(:language, :string)
    field(:activity_metrics, :string)
    field(:community_involvement, :string)
    field(:development_practices, :string)
    field(:skill_score, :float)
    field(:potential_score, :float)
    field(:conclusion, :string)
  end

  # Rejects any score that is not a member of @score_interval.
  @impl true
  def validate_changeset(changeset) do
    changeset
    |> validate_inclusion(:skill_score, @score_interval)
    |> validate_inclusion(:potential_score, @score_interval)
  end
end
```

```elixir
defmodule ReportGenerator do
  @moduledoc """
  Fetches repository data from the GitHub GraphQL API and turns it into a
  `Report` via `Instructor.chat_completion/1`.
  """

  @req Req.new(base_url: "https://api.github.com/graphql")

  # Maximum number of whitespace-separated README words kept by cleanup_nodes/1.
  @max_readme_words 300
  # Maximum number of characters kept from each issue body by cleanup_nodes/1.
  @issue_char_limit 300

  @doc """
  Shrinks a list of repository nodes (as returned by the query in `run_query/3`).

  For every node this:

    * flattens `"repositoryTopics"` into a list of topic names and adds a
      `"relatedTopics"` list,
    * truncates every open/closed issue body to #{@issue_char_limit} characters,
      replacing `"openIssues"`/`"closedIssues"` with the plain issue lists and
      adding `"openIssueCount"`/`"closedIssueCount"`,
    * collapses the three README aliases into a single `"readme"` entry limited
      to #{@max_readme_words} words (an empty text when no README was found).

  Note: `generate_report/4` does not call this function.
  """
  @spec cleanup_nodes([map()]) :: [map()]
  def cleanup_nodes(nodes), do: Enum.map(nodes, &cleanup_node/1)

  # Applies all size reductions to a single repository node.
  defp cleanup_node(node) do
    topic_nodes = get_in(node, ["repositoryTopics", "nodes"]) || []
    topics = for %{"topic" => %{"name" => name}} <- topic_nodes, do: name

    related_topics =
      for %{"topic" => %{"relatedTopics" => related}} <- topic_nodes,
          %{"name" => name} <- List.wrap(related),
          do: name

    %{"totalCount" => open_issue_count, "nodes" => open_issues} =
      Map.fetch!(node, "openIssues")

    %{"totalCount" => closed_issue_count, "nodes" => closed_issues} =
      Map.fetch!(node, "closedIssues")

    # Any of the three aliases may be null; fall back to an empty map so a
    # repository without a README yields %{"text" => ""} instead of crashing.
    readme =
      (node["readmeMD"] || node["readme"] || node["readmeLowerMD"] || %{})
      |> Map.update("text", "", &truncate_words/1)

    node
    |> Map.replace!("repositoryTopics", topics)
    |> Map.put_new("relatedTopics", related_topics)
    |> Map.put("readme", readme)
    |> Map.drop(["readmeMD", "readmeLowerMD"])
    # Open and closed issues are truncated independently: the two lists have
    # different lengths and must never be zipped together.
    |> Map.replace!("openIssues", truncate_issues(open_issues))
    |> Map.replace!("closedIssues", truncate_issues(closed_issues))
    |> Map.put_new("openIssueCount", open_issue_count)
    |> Map.put_new("closedIssueCount", closed_issue_count)
  end

  # Limits the "bodyText" of every issue to @issue_char_limit characters.
  defp truncate_issues(issues) do
    Enum.map(issues, fn issue ->
      Map.update!(issue, "bodyText", &String.slice(&1, 0, @issue_char_limit))
    end)
  end

  # Keeps at most @max_readme_words space-separated words of a README text.
  defp truncate_words(nil), do: ""

  defp truncate_words(text) when is_binary(text) do
    text
    |> String.split(" ", trim: true)
    |> Enum.take(@max_readme_words)
    |> Enum.join(" ")
  end

  @doc """
  Runs the repository report GraphQL query for `repo_owner/repo_name`,
  authenticating with the GitHub token `gh_tok`.

  Returns the result of `Req.post/2` unchanged.
  """
  @spec run_query(String.t(), String.t(), String.t()) ::
          {:ok, Req.Response.t()} | {:error, Exception.t()}
  def run_query(repo_owner, repo_name, gh_tok) do
    query = """
    query getRepoReport($repoName: String!, $repoOwner: String!) {
      repository(name: $repoName, owner: $repoOwner) {
        commitComments(after: null, first: 5) {
          nodes {
            bodyText
          }
        }
        squashMergeAllowed
        squashMergeCommitTitle
        squashMergeCommitMessage
        webCommitSignoffRequired
        openPRs: pullRequests(first: 5, after: null, states: OPEN) {
          totalCount
          nodes {
            totalCommentsCount
            merged
            participants(first: 100, after: null) {
              totalCount
            }
          }
        }
        closedPRs: pullRequests(first: 5, after: null, states: CLOSED) {
          totalCount
          nodes {
            totalCommentsCount
            merged
            participants(first: 100, after: null) {
              totalCount
            }
          }
        }
        deleteBranchOnMerge
        deployments(first: 5, after: null) {
          nodes {
            task
            payload
          }
        }
        discussions(first: 5, after: null) {
          nodes {
            answer {
              minimizedReason
            }
            labels {
              nodes {
                name
              }
            }
          }
        }
        latestRelease {
          name
          tagName
          createdAt
          updatedAt
        }
        mergeCommitAllowed
        openIssues: issues(states: OPEN, first: 10, after: null) {
          totalCount
          nodes {
            title
            bodyText
          }
        }
        closedIssues: issues(states: CLOSED, first: 5, after: null) {
          totalCount
          nodes {
            title
            bodyText
          }
        }
        isLocked
        forkCount
        forkingAllowed
        hasDiscussionsEnabled
        hasIssuesEnabled
        hasVulnerabilityAlertsEnabled
        hasWikiEnabled
        isFork
        isSecurityPolicyEnabled
        readmeMD: object(expression: "HEAD:README.md") {
          ... on Blob {
            text
          }
        }
        readme: object(expression: "HEAD:README") {
          ... on Blob {
            text
          }
        }
        readmeLowerMD: object(expression: "HEAD:readme.md") {
          ... on Blob {
            text
          }
        }
        name
        primaryLanguage {
          name
        }
        contactLinks {
          name
          about
          url
        }
        createdAt
        description
        updatedAt
        stargazerCount
        homepageUrl
        url
        owner {
          avatarUrl
          login
          url
        }
        openGraphImageUrl
        pushedAt
        repositoryTopics(after: null, first: 5) {
          totalCount
          nodes {
            topic {
              name
              relatedTopics {
                name
              }
            }
          }
        }
      }
    }
    """

    variables = %{"repoName" => repo_name, "repoOwner" => repo_owner}

    Req.post(
      @req,
      headers: %{"Authorization" => "bearer #{gh_tok}"},
      json: %{"query" => query, "variables" => variables}
    )
  end

  @doc """
  Asks the OpenAI `model` to write a `Report` from `gql_response`, the decoded
  `"data"` of the GitHub GraphQL response.

  Returns the result of `Instructor.chat_completion/1` unchanged.
  """
  def get_report(gql_response, oai_token, model) do
    Instructor.chat_completion(
      model: model,
      response_model: Report,
      max_retries: 3,
      messages: [
        %{
          role: "user",
          content: """
          I'm making a website that showcases Github repos that have been abandoned, but have a lot of stars and permissive licenses. This is to allow people to look for potential projects that they can fork, take over, or use freely.

          I want to have premium Reports that people can look at that summarize some of the history of the project. Im using the GitHub Graphql API. What fields and/or items should I use from the API to help synthesize these reports? My idea is to generate these reports using an LLM that is given these items from the API and generates the reports according to a predefined structure.

          Your job is to write these reports in a professional, but not stiff manner. Assume that people are paying a premium for these reports, so they must provide good insight based on the given information from the GitHub GraphQL API. These results should be presentable as a dossier about the repository in question, and should make conclusions derived from the available data. The conclusions should be well thought out, and not simply restating the facts.

          Here is the result of the GrapqhQL Response from Github: #{Jason.encode!(gql_response)}
          """
        }
      ],
      config: %OpenAI.Config{
        http_options: [recv_timeout: 10 * 60 * 1000],
        api_key: oai_token
      }
    )
  end

  @doc """
  Generates a report for the repository at `path` (`"owner/repo"`).

  Returns whatever `get_report/3` returns on success, or `{:error, message}`
  with a human-readable `message` when `path` is malformed or GitHub could not
  provide the repository data.
  """
  def generate_report(path, gh_tok, oai_tok, model) do
    case String.split(path, "/") do
      [owner, repo] when owner != "" and repo != "" ->
        owner
        |> run_query(repo, gh_tok)
        |> handle_github_response(oai_tok, model)

      _ ->
        {:error, "Repo must be in the form 'owner/repo', got #{inspect(path)}"}
    end
  end

  # GitHub's GraphQL endpoint answers HTTP 200 even when the query failed
  # (e.g. unknown repository): "data.repository" is then null and the reason is
  # listed under "errors". Only call the LLM when a repository was returned.
  defp handle_github_response({:ok, %Req.Response{status: 200, body: body}}, oai_tok, model) do
    case body do
      %{"data" => %{"repository" => repo} = data} when is_map(repo) ->
        get_report(data, oai_tok, model)

      %{"errors" => [%{"message" => message} | _]} ->
        {:error, "Something went wrong when communicating with the GitHub API. #{message}."}

      _ ->
        {:error, "Something went wrong when communicating with the GitHub API. Try again later."}
    end
  end

  defp handle_github_response({:ok, %Req.Response{body: body}}, _oai_tok, _model) do
    case body do
      %{"message" => message} when is_binary(message) ->
        {:error, "Something went wrong when communicating with the GitHub API. #{message}."}

      _ ->
        {:error, "Something went wrong when communicating with the GitHub API. Try again later."}
    end
  end

  defp handle_github_response({:error, _exception}, _oai_tok, _model) do
    {:error, "Something went wrong when communicating with the GitHub API. Try again later."}
  end
end
```

```elixir
defmodule LiveReport do
  @moduledoc """
  A `Kino.JS.Live` widget that displays the current report, a loading state or
  an error page, and remembers the last rendered HTML so it can be downloaded.
  """

  use Kino.JS
  use Kino.JS.Live

  @empty_state """
  Report
  """

  @doc "Creates a new widget showing the empty state."
  def new() do
    Kino.JS.Live.new(__MODULE__, nil)
  end

  @doc """
  Renders `report` (a `%Report{}`, `{:error, reason}`, `:loading` or `nil`) for
  the client `origin` and returns the rendered HTML.
  """
  def replace(kino, report, origin) do
    Kino.JS.Live.call(kino, {:replace, report, origin})
  end

  @doc "Shows the loading state to the client `origin`."
  def loading(kino, origin) do
    Kino.JS.Live.call(kino, {:replace, :loading, origin})
  end

  @doc "Returns the most recently rendered HTML."
  def download_html(kino) do
    Kino.JS.Live.call(kino, {:download})
  end

  @impl true
  def init(_report, ctx) do
    {:ok, assign(ctx, html: @empty_state)}
  end

  # New clients always start from the empty state: updates are delivered only
  # to the client that submitted the form (see handle_call/3 below).
  @impl true
  def handle_connect(ctx) do
    {:ok, @empty_state, ctx}
  end

  @impl true
  def handle_call({:download}, _from, ctx) do
    # Values stored with assign/2 live under ctx.assigns, and the context must
    # be returned unchanged or the server loses its state.
    {:reply, ctx.assigns.html, ctx}
  end

  def handle_call({:replace, content, origin}, _from, ctx) do
    html = render(content)
    send_event(ctx, origin, "replace", html)
    {:reply, html, assign(ctx, html: html)}
  end

  # Turns the different widget states into the markup sent to the client.
  defp render(%Report{} = report) do
    """
    Report

    Project Report

    Detailed analysis and insights

    Repository Details
    #{report.repository_details}
    Category
    #{format_category(report.category)}
    Language
    #{report.language}
    Activity Metrics
    #{report.activity_metrics}
    Community Involvement
    #{report.community_involvement}
    Development Practices
    #{report.development_practices}
    Skill Score
    #{report.skill_score}
    Potential Score
    #{report.potential_score}
    Conclusion
    #{report.conclusion}
    """
  end

  defp render(nil), do: @empty_state

  defp render({:error, reason}) do
    """
    Error Page

    Oops! Something went wrong.

    #{reason}

    """
  end

  defp render(:loading) do
    """
    Report
    """
  end

  # :software_library -> "Software Library"
  defp format_category(category) do
    category
    |> Atom.to_string()
    |> String.split("_")
    |> Enum.map_join(" ", &String.capitalize/1)
  end

  asset "main.js" do
    """
    export function init(ctx, html) {
      ctx.root.innerHTML = html;

      ctx.handleEvent("replace", (html) => {
        ctx.root.innerHTML = html;
      });
    }
    """
  end
end
```

```elixir
live_report = LiveReport.new()

instructions =
  Kino.Markdown.new("""
  Welcome! This demo showcases using LLMs to generate automated smart reports.

  Here, we use the GitHub API + OpenAI to analyze your repository and give you a downloadable report
  with insights about its suitability for open-source contributions and an overall summary!

  Get started by following the steps below:

  1. Enter your GitHub API Token. This should be a Classic Token with `repo` scope. [See Docs](https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/managing-your-personal-access-tokens#personal-access-tokens-classic)
  1. Enter your OpenAI API Token. [See Docs](https://platform.openai.com/docs/quickstart/step-2-setup-your-api-key)
  1. Enter the name of the OpenAI model you would like to use [See Available Models](https://platform.openai.com/docs/models/models) (requires authentication)
      * I suggest `gpt-4`, as it is likely that that the request will exceed the context window limit of `gpt-3.5-turbo`

  If you liked this demo, consider giving a star to the wonderful libraries that helped power this app:
  * [Livebook](https://github.com/livebook-dev/livebook) / [Kino](https://github.com/livebook-dev/kino/)
  * [Instructor](https://github.com/thmsmlr/instructor_ex)
  * [Req](https://github.com/wojtekmach/req)
  * [OpenAI Elixir Wrapper](https://github.com/mgallo/openai.ex)

  ---

  You can also follow my work on [GitHub](https://github.com/acalejos)

  You can read more long-form articles at [The Stack Canary](https://www.thestackcanary.com)

  ![X (formerly Twitter) Follow](https://img.shields.io/twitter/follow/ac_alejos)

  ---
  """)

form =
  Kino.Control.form(
    [
      github_token: Kino.Input.text("GitHub API Key"),
      openai_token: Kino.Input.text("OpenAI API Key"),
      model: Kino.Input.text("OpenAI Model"),
      repo: Kino.Input.text("GitHub Repo (owner/repo)")
    ],
    submit: "Generate"
  )

# Maps a failure reason to the message shown to the user. The reason is not
# guaranteed to be a string (it may be e.g. a changeset), so it is inspected
# before being searched.
describe_error = fn reason ->
  message = if is_binary(reason), do: reason, else: inspect(reason)

  cond do
    String.contains?(message, "invalid_api_key") ->
      "Invalid OpenAI API Key"

    String.contains?(message, "invalid model ID") or
        String.contains?(message, "model_not_found") ->
      "Invalid OpenAI Model"

    true ->
      message
  end
end

Kino.listen(form, fn %{
                       data: %{
                         repo: repo,
                         github_token: gh_tok,
                         openai_token: oai_tok,
                         model: model
                       },
                       origin: origin
                     } ->
  LiveReport.loading(live_report, origin)

  # Either a %Report{} or an {:error, message} tuple; LiveReport renders both.
  outcome =
    cond do
      repo == "" ->
        {:error, "Please specify a GitHub Repo"}

      gh_tok == "" ->
        {:error, "Please specify a GitHub Token"}

      oai_tok == "" ->
        {:error, "Please specify an OpenAI Token"}

      model == "" ->
        {:error, "Please specify an OpenAI model"}

      true ->
        case ReportGenerator.generate_report(repo, gh_tok, oai_tok, model) do
          {:ok, report} -> report
          {:error, reason} -> {:error, describe_error.(reason)}
        end
    end

  LiveReport.replace(live_report, outcome, origin)
end)

# The widget already remembers the last rendered HTML, so the download reads
# it from there; this also works before any report has been generated.
download =
  Kino.Download.new(
    fn -> LiveReport.download_html(live_report) end,
    label: "Download Report",
    filename: "report.html"
  )

Kino.Layout.grid([instructions, form, live_report, download], columns: 1, gap: 4, boxed: true)
```