In [14]:
from openai import AzureOpenAI
from dotenv import load_dotenv
import os
import requests
from dataclasses import dataclass
from bs4 import BeautifulSoup
from xml.etree import ElementTree as ET
import logging
from functools import lru_cache
In [2]:
# frozen=True makes instances immutable (safe to cache / share);
# repr is generated by default, so it is not spelled out here.
@dataclass(frozen=True)
class LLMCategorisationResponse:
    """Immutable result of an LLM yes/no categorisation.

    bool_response -- the model's verdict (True = project uses AI).
    reasoning     -- the model's natural-language justification.
    """
    # Parsed from the <bool_response> tag of the model's XML reply.
    bool_response: bool
    # Parsed from the <reasoning> tag of the model's XML reply.
    reasoning: str
In [3]:
# Secrets that must be present in .env for the Azure OpenAI client below.
required_secrets = ["GEN_AI_API_VERSION", "GEN_AI_API_ENDPOINT", "GEN_AI_API_KEY", "GEN_AI_MODEL_VERSION"]
# BUG FIX: the original used adjacent string literals, so the entire message
# became the *separator* passed to str.join() and the assertion text came out
# garbled. Concatenating with '+' produces the intended message.
assert load_dotenv(), (
    "No .env found, please place a .env file in current folder with secrets: "
    + ", ".join(required_secrets)
)
In [4]:
# Dedicated logger for this notebook's pipeline.
logger = logging.getLogger("AIAct")
logger.setLevel(logging.INFO)
# FIX: guard against re-running this cell — without the check each execution
# attached another StreamHandler, so every message printed multiple times.
if not logger.handlers:
    console_handler = logging.StreamHandler()
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    console_handler.setFormatter(formatter)
    logger.addHandler(console_handler)
In [5]:
# Materialise each secret as a module-level name (GEN_AI_API_KEY, ...),
# matching how the client cell references them.
# FIX: os.getenv() returns None for missing keys, which the original bound
# silently and only surfaced later as an opaque client error — fail fast here.
for secret in required_secrets:
    value = os.getenv(secret)
    assert value is not None, f"Missing required secret: {secret}"
    globals()[secret] = value
In [6]:
# Azure OpenAI client, configured entirely from the .env-provided secrets.
# NOTE(review): GEN_AI_MODEL_VERSION is used as the *deployment* name here —
# confirm that env var actually holds a deployment identifier.
client = AzureOpenAI(
    api_key=GEN_AI_API_KEY,
    azure_endpoint=GEN_AI_API_ENDPOINT,
    azure_deployment=GEN_AI_MODEL_VERSION,
    api_version=GEN_AI_API_VERSION,
)
In [15]:
@lru_cache
def get_markdown_from_gh_repo(repo_url: str, max_file_size: int = 243) -> "Tag | None":
    """Fetch an LLM-friendly text digest of a public GitHub repo via gitingest.com.

    :param repo_url: Relative repo path in ``<username>/<repo_name>`` format.
    :param max_file_size: gitingest file-size slider value (keeps the original
        hard-coded 243 as the default, now tunable by callers).
    :returns: The ``<textarea class="result-text">`` tag holding the digest, or
        ``None`` if gitingest's page layout changed. Callers read ``.text`` on
        the returned tag — the original ``-> str`` annotation was wrong.
    :raises requests.HTTPError: if the gitingest request fails.
    """
    gitingest_endpoint = f"https://gitingest.com/{repo_url}"
    full_repo_url = f"https://github.com/{repo_url}"
    form_data = {
        "input_text": full_repo_url,
        "max_file_size": max_file_size,
        "pattern_type": "exclude",
        "pattern": "",
    }
    # FIX: timeout so a hung gitingest request cannot block the notebook forever.
    resp = requests.post(gitingest_endpoint, data=form_data, timeout=60)
    resp.raise_for_status()
    # FIX: explicit parser — bare BeautifulSoup(text) emits GuessedAtParserWarning
    # and may pick different parsers (and produce different trees) per machine.
    soup = BeautifulSoup(resp.text, "html.parser")
    return soup.find("textarea", {"class": "result-text"})
In [20]:
def _cast_xml_to_type(xml_content: str) -> 'LLMCategorisationResponse':
    """Parse the model's XML reply into an LLMCategorisationResponse.

    :param xml_content: XML string with <bool_response> and <reasoning>
        children under a single root (see the system prompt's schema).
    :returns: Parsed, immutable response object.
    :raises xml.etree.ElementTree.ParseError: if the reply is not well-formed XML.
    """
    xml_obj = ET.fromstring(xml_content)
    # FIX: strip() tolerates padded tag text like "<bool_response> True </bool_response>",
    # which the prompt's schema template explicitly shows; the original compared
    # raw text and would have classified " True " as False. Also drops the
    # redundant `True if ... else False` around an already-boolean comparison.
    bool_resp = xml_obj.find("bool_response").text.strip().lower() == "true"
    reasoning = xml_obj.find("reasoning").text.strip()
    return LLMCategorisationResponse(bool_response=bool_resp, reasoning=reasoning)
In [21]:
@lru_cache
def does_project_use_llm(repo_url: str) -> 'LLMCategorisationResponse':
    """Check using AI if repo uses LLM

    :param repo_url: Relative path of URL in format <username>/<repo_name> of a publicly accessible GH repo
    :returns: LLMCategorisationResponse
    """
    # FIX: the first prompt line originally ended with a stray ".env" typo,
    # which leaked into every system prompt sent to the model.
    sys_prompt = """you are a classification agent which helps determine if a git repository makes use of AI (LLM or other machine learning technology).

    //Analyse file system structure and go through provided README and other informations to create an in-depth response and reason your output.

    //Output Format: 
    Output in valid XML-Structure using extra reasoning tokens to create a proper and comprehensive reasoning following OpenAI reasoning guidelines.

    Return pure XML only part without any code blocks or xml annotation infront.
    
    Following will be your output schema definition: 
    <result>
    <bool_response> {True or False depending on if project makes use of AI} </bool_response>
    <reasoning>{Reason your decision in comprehensive natural language in plain english}</reasoning>
    </result>
    
    Example Output:
    <result>
    <bool_response>True</bool_response>
    <reasoning>Repository uses AI because in Python File myrepo.analyse.categorize.py it makes call to BERT text categorization model.</reasoning>
    </result>
    
    """
    logger.debug("Parsing Repository content into Markdown...")
    repo_text_content = get_markdown_from_gh_repo(repo_url).text
    logger.debug("Using AI to analyse")
    # Cap at 2000 chars because of token limit.
    repo_text_content = repo_text_content[:2000]
    # FIX: the original system message carried an invalid "type": "text" key;
    # chat messages take only role/content (plus name/tool fields).
    # NOTE(review): model="gpt-4o" is hard-coded although the client already
    # pins azure_deployment=GEN_AI_MODEL_VERSION — confirm which one Azure
    # honours and whether these two can disagree.
    completion = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": sys_prompt},
            {"role": "user", "content": repo_text_content},
        ],
    )
    ai_response = completion.choices[0].message.content.strip()
    return _cast_xml_to_type(ai_response)
In [22]:
def e2e_evaluation(repo):
    """Run the full classification pipeline for one repo and log the verdict."""
    verdict = does_project_use_llm(repo)
    outcome = "uses" if verdict.bool_response else "does not use"
    logger.info(f"AI System says the project `{repo}` {outcome} AI.")
    logger.info(f"Full reasoning: {verdict.reasoning}")
In [23]:
# Smoke-test set: two repos expected to be non-AI, two expected to be AI.
projects = [
    "regmibijay/strato-dyndns",
    "pymupdf/PyMuPDF",
    "onyx-dot-app/onyx",
    "sinaptik-ai/pandas-ai",
]
for project in projects:
    logger.info(f"Evaluating [{project}]")
    e2e_evaluation(project)
2025-01-11 11:51:50,443 - AIAct - INFO - Evaluating [regmibijay/strato-dyndns]
2025-01-11 11:51:51,547 - AIAct - INFO - AI System says the project `regmibijay/strato-dyndns` does not use AI.
2025-01-11 11:51:51,548 - AIAct - INFO - Full reasoning: The project "Strato-DynDNS" is a tool for updating DNS records for websites on DNS servers, specifically designed for use with Strato and Namecheap. The README does not mention any use of AI, machine learning technology, or LLMs. The provided information focuses on DNS updates, installation, and usage instructions, which are not related to AI.
2025-01-11 11:51:51,550 - AIAct - INFO - Evaluating [pymupdf/PyMuPDF]
2025-01-11 11:51:52,502 - AIAct - INFO - AI System says the project `pymupdf/PyMuPDF` does not use AI.
2025-01-11 11:51:52,503 - AIAct - INFO - Full reasoning: The repository PyMuPDF focuses on providing Python bindings and abstractions for data extraction, analysis, conversion, and manipulation of various document types including PDFs. It does not mention or leverage any machine learning technologies or AI models in its described functionalities or optional features. It primarily supports direct document processing and OCR through Tesseract, which is not an AI model but a traditional OCR engine.
2025-01-11 11:51:52,504 - AIAct - INFO - Evaluating [onyx-dot-app/onyx]
2025-01-11 11:51:53,764 - AIAct - INFO - AI System says the project `onyx-dot-app/onyx` uses AI.
2025-01-11 11:51:53,766 - AIAct - INFO - Full reasoning: Repository uses AI because the README explicitly states that Onyx is an AI Assistant that connects to company documents, apps, and people, and it plugs into any LLM (Large Language Model) of your choice. This indicates the use of machine learning technology, specifically LLM.
2025-01-11 11:51:53,767 - AIAct - INFO - Evaluating [sinaptik-ai/pandas-ai]
2025-01-11 11:51:54,780 - AIAct - INFO - AI System says the project `sinaptik-ai/pandas-ai` uses AI.
2025-01-11 11:51:54,782 - AIAct - INFO - Full reasoning: The repository "PandasAI" appears to use AI as it is described as a platform that facilitates interaction with data using natural language. This implies the usage of natural language processing (NLP) techniques, a subset of AI, to interpret and process user queries.
In [ ]: