<repository_structure>
<directory name="async_llm_handler">
    <file>
        <name>.env</name>
        <path>.env</path>
        <content>Full content not provided</content>
    </file>
    <file>
        <name>.gitignore</name>
        <path>.gitignore</path>
        <content>Full content not provided</content>
    </file>
    <file>
        <name>pyproject.toml</name>
        <path>pyproject.toml</path>
        <content>
# File: async_llm_handler/pyproject.toml

[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

[project]
name = "async_llm_handler"
version = "0.1.0"
description = "An asynchronous handler for multiple LLM APIs"
readme = "README.md"
# handler.py calls asyncio.to_thread, which was added in Python 3.9.
requires-python = ">=3.9"
license = "MIT"
keywords = ["llm", "api", "async", "nlp"]
authors = [
  { name = "Your Name", email = "your.email@example.com" },
]
# asyncio is part of the standard library and must not be listed here: the
# PyPI distribution of that name is an obsolete backport for old interpreters.
dependencies = [
  "anthropic",
  "cohere",
  "google-generativeai",
  "groq",
  "openai",
  "python-dotenv",
  "tiktoken",
]

[project.optional-dependencies]
dev = [
  "pytest",
  "pytest-asyncio",
]

[project.urls]
Homepage = "https://github.com/yourusername/async_llm_handler"
Repository = "https://github.com/yourusername/async_llm_handler.git"
"Bug Tracker" = "https://github.com/yourusername/async_llm_handler/issues"

[tool.pytest.ini_options]
asyncio_mode = "auto"

[tool.hatch.build.targets.wheel]
packages = ["async_llm_handler"]
        </content>
    </file>
    <file>
        <name>README.md</name>
        <path>README.md</path>
        <content>
# Async LLM Handler

Async LLM Handler is a Python package that provides a unified interface for interacting with multiple Language Model APIs asynchronously. It supports Gemini, Claude, OpenAI, Cohere, and Llama (via Groq) APIs.

## Features

- Asynchronous API calls
- Automatic rate limiting
- Easy switching between different LLM providers
- Fallback mechanism when using multiple APIs
- Token counting and prompt clipping utilities

## Installation

You can install the Async LLM Handler using pip:

```bash
pip install async-llm-handler
```

## Usage

First, set up your environment variables in a `.env` file:

```
GEMINI_API_KEY=your_gemini_api_key
CLAUDE_API_KEY=your_claude_api_key
OPENAI_API_KEY=your_openai_api_key
COHERE_API_KEY=your_cohere_api_key
GROQ_API_KEY=your_groq_api_key
```

Then, you can use the package as follows:

### Synchronous Usage

```python
from async_llm_handler import LLMHandler

handler = LLMHandler()
response = handler.query("What is the meaning of life?", model="gpt_4o")
print(response)
```

### Asynchronous Usage

```python
import asyncio
from async_llm_handler import LLMHandler

async def main():
    handler = LLMHandler()
    response = await handler.query("What is the meaning of life?", model="gpt_4o", sync=False)
    print(response)

asyncio.run(main())
```

## Advanced Usage

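Every query names the model to use. The supported names are `gemini_flash`, `gpt_4o`, `gpt_4o_mini`, `claude_3_5_sonnet` and `claude_3_haiku`: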

```python
response = handler.query("Tell me a joke", model="gpt_4o")
```

Or use multiple models concurrently:

```python
import asyncio
from async_llm_handler import LLMHandler

async def main():
    handler = LLMHandler()
    prompt = "What is the best programming language?"
    
    tasks = [
        handler.query(prompt, model='gemini_flash', sync=False),
        handler.query(prompt, model='gpt_4o', sync=False),
        handler.query(prompt, model='claude_3_5_sonnet', sync=False)
    ]
    
    responses = await asyncio.gather(*tasks)
    
    for i, response in enumerate(responses):
        print(f"Response from model {i+1}: {response}")

asyncio.run(main())
```

## Contributing

Contributions are welcome! Please feel free to submit a Pull Request.

## License

This project is licensed under the MIT License.

        </content>
    </file>
</directory>
    <directory name=".pytest_cache">
    <file>
        <name>.gitignore</name>
        <path>.pytest_cache\.gitignore</path>
        <content>Full content not provided</content>
    </file>
    <file>
        <name>CACHEDIR.TAG</name>
        <path>.pytest_cache\CACHEDIR.TAG</path>
        <content>Full content not provided</content>
    </file>
    <file>
        <name>README.md</name>
        <path>.pytest_cache\README.md</path>
        <content>
# pytest cache directory #

This directory contains data from the pytest's cache plugin,
which provides the `--lf` and `--ff` options, as well as the `cache` fixture.

**Do not** commit this to version control.

See [the docs](https://docs.pytest.org/en/stable/how-to/cache.html) for more information.

        </content>
    </file>
    </directory>
        <directory name="v">
        </directory>
            <directory name="cache">
    <file>
        <name>lastfailed</name>
        <path>.pytest_cache\v\cache\lastfailed</path>
        <content>Full content not provided</content>
    </file>
    <file>
        <name>nodeids</name>
        <path>.pytest_cache\v\cache\nodeids</path>
        <content>Full content not provided</content>
    </file>
    <file>
        <name>stepwise</name>
        <path>.pytest_cache\v\cache\stepwise</path>
        <content>Full content not provided</content>
    </file>
            </directory>
    <directory name="async_llm_handler">
    <file>
        <name>config.py</name>
        <path>async_llm_handler\config.py</path>
        <content>
# File: async_llm_handler/config.py

import os
from dotenv import load_dotenv

load_dotenv()

class Config:
    """Snapshot of the provider API keys found in the process environment.

    Every key is read once, when the instance is created, via ``os.getenv``;
    a variable that is not set is stored as ``None``.
    """

    def __init__(self):
        # (attribute name, environment variable it is read from)
        sources = (
            ("gemini_api_key", "GEMINI_API_KEY"),
            ("claude_api_key", "CLAUDE_API_KEY"),
            ("openai_api_key", "OPENAI_API_KEY"),
            ("cohere_api_key", "COHERE_API_KEY"),
            ("groq_api_key", "GROQ_API_KEY"),
        )
        for attribute, variable in sources:
            setattr(self, attribute, os.getenv(variable))

    def __getitem__(self, key):
        """Return the attribute called *key*, so ``config["x"]`` equals ``config.x``."""
        return getattr(self, key)
        </content>
    </file>
    <file>
        <name>exceptions.py</name>
        <path>async_llm_handler\exceptions.py</path>
        <content>
# File: async_llm_handler/exceptions.py

class LLMAPIError(Exception):
    """Signals that a request to an LLM provider API did not succeed."""
        </content>
    </file>
    <file>
        <name>handler.py</name>
        <path>async_llm_handler\handler.py</path>
        <content>
# File: async_llm_handler/handler.py

import asyncio
from typing import Optional, Union
from concurrent.futures import ThreadPoolExecutor
import anthropic
import google.generativeai as genai
from openai import AsyncOpenAI

from .config import Config
from .exceptions import LLMAPIError
from .utils.rate_limiter import RateLimiter
from .utils.token_utils import clip_prompt
from .utils.logger import get_logger

logger = get_logger(__name__)

from typing import Union, Optional, Coroutine, Any

class LLMHandler:
    """Single entry point for querying Gemini, Claude and OpenAI chat models.

    Models are addressed by a short name: ``gemini_flash``, ``gpt_4o``,
    ``gpt_4o_mini``, ``claude_3_5_sonnet`` or ``claude_3_haiku``. ``query``
    resolves that name to a ``_query_<model>_sync`` or ``_query_<model>_async``
    method, so any helper added to this class must avoid that naming pattern
    or it becomes reachable as a "model".
    """

    # The annotation is quoted so it is not evaluated when the class is defined.
    def __init__(self, config: Optional["Config"] = None):
        """Create the provider clients and the per-model rate limiters.

        Args:
            config: Source of the API keys. A fresh ``Config()`` is built when
                omitted (or when a falsy value is passed).
        """
        self.config = config or Config()
        self._setup_clients()
        self._setup_rate_limiters()
        # Not used by any method of this class; the attribute is kept as-is.
        self._executor = ThreadPoolExecutor()

    def _setup_clients(self):
        """Instantiate one client per provider from the configured API keys."""
        genai.configure(api_key=self.config.gemini_api_key)
        self.gemini_client = genai.GenerativeModel(
            "gemini-1.5-flash-latest",
            safety_settings=[
                {"category": "HARM_CATEGORY_HARASSMENT", "threshold": "BLOCK_NONE"},
                {"category": "HARM_CATEGORY_HATE_SPEECH", "threshold": "BLOCK_NONE"},
                {"category": "HARM_CATEGORY_SEXUALLY_EXPLICIT", "threshold": "BLOCK_NONE"},
                {"category": "HARM_CATEGORY_DANGEROUS_CONTENT", "threshold": "BLOCK_NONE"},
            ],
            generation_config={"response_mime_type": "application/json"},
        )
        # Synchronous Anthropic client: the async Claude paths run it in a thread.
        self.claude_client = anthropic.Anthropic(api_key=self.config.claude_api_key)
        self.openai_client = AsyncOpenAI(api_key=self.config.openai_api_key)

    def _setup_rate_limiters(self):
        """Create one ``RateLimiter(rate, period)`` per model, keyed by model name."""
        self.rate_limiters = {
            'gemini_flash': RateLimiter(30, 60),
            'claude_3_5_sonnet': RateLimiter(5, 60),
            'claude_3_haiku': RateLimiter(5, 60),
            'gpt_4o': RateLimiter(5, 60),
            'gpt_4o_mini': RateLimiter(5, 60)
        }

    def query(self, prompt: str, model: str, sync: bool = True, max_input_tokens: Optional[int] = None, max_output_tokens: Optional[int] = None) -> Union[str, Coroutine[Any, Any, str]]:
        """Send *prompt* to *model*.

        Args:
            prompt: Text of the user message.
            model: Short model name (see the class docstring).
            sync: When true the call blocks and returns the response text;
                when false a coroutine is returned that the caller must await.
            max_input_tokens: If truthy, the prompt is clipped to this many
                tokens before sending.
            max_output_tokens: Optional cap on the response length.

        Raises:
            ValueError: If *model* has no matching query method.
            LLMAPIError: If the provider call fails.
        """
        if sync:
            return self._sync_query(prompt, model, max_input_tokens, max_output_tokens)
        else:
            return self._async_query(prompt, model, max_input_tokens, max_output_tokens)

    def _sync_query(self, prompt: str, model: str, max_input_tokens: Optional[int] = None, max_output_tokens: Optional[int] = None) -> str:
        """Dispatch to ``_query_<model>_sync`` and return its result."""
        method = getattr(self, f'_query_{model}_sync', None)
        if not method:
            raise ValueError(f"Unsupported model for sync query: {model}")

        return method(prompt, max_input_tokens, max_output_tokens)

    async def _async_query(self, prompt: str, model: str, max_input_tokens: Optional[int] = None, max_output_tokens: Optional[int] = None) -> str:
        """Dispatch to ``_query_<model>_async`` and await its result."""
        method = getattr(self, f'_query_{model}_async', None)
        if not method:
            raise ValueError(f"Unsupported model for async query: {model}")

        return await method(prompt, max_input_tokens, max_output_tokens)

    # ------------------------------------------------------------------
    # Shared helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _run_blocking(coro):
        """Run *coro* to completion on a private event loop and return its result.

        ``asyncio.run`` creates the loop, cancels leftover tasks, closes the
        loop and clears the thread's current loop, so no closed loop is left
        installed afterwards. It raises ``RuntimeError`` when the calling
        thread already has a running event loop.
        """
        return asyncio.run(coro)

    @staticmethod
    def _gemini_call_options(max_output_tokens: Optional[int]) -> dict:
        """Build the keyword arguments for a Gemini ``generate_content`` call."""
        if max_output_tokens is None:
            return {}
        # generate_content accepts no ``max_output_tokens`` keyword; the cap
        # has to be passed inside ``generation_config``.
        return {"generation_config": {"max_output_tokens": max_output_tokens}}

    @staticmethod
    def _gemini_text(response) -> str:
        """Return the text of the first part of the first candidate."""
        if response.candidates:
            return response.candidates[0].content.parts[0].text
        raise ValueError("Invalid response format from Gemini Flash API.")

    async def _openai_chat(self, limiter_key: str, model_id: str, label: str, prompt: str, max_input_tokens: Optional[int], max_output_tokens: Optional[int]) -> str:
        """Send one user message to an OpenAI chat model and return the reply text.

        Args:
            limiter_key: Key into ``self.rate_limiters``.
            model_id: Model identifier sent to the API.
            label: Human-readable model name used in log and error messages.
        """
        await self.rate_limiters[limiter_key].acquire_async()
        try:
            if max_input_tokens:  # None or 0 means "do not clip"
                prompt = clip_prompt(prompt, max_input_tokens)
            params = {
                "model": model_id,
                "messages": [{"role": "user", "content": prompt}],
                "temperature": 0.3,
                "top_p": 1,
                "frequency_penalty": 0,
                "presence_penalty": 0,
            }
            if max_output_tokens is not None:
                params["max_tokens"] = max_output_tokens
            response = await self.openai_client.chat.completions.create(**params)
            return response.choices[0].message.content
        except Exception as e:
            logger.error(f"Error with {label} API: {e}")
            raise LLMAPIError(f"{label} API error: {str(e)}") from e
        finally:
            self.rate_limiters[limiter_key].release()

    async def _claude_message(self, limiter_key: str, model_id: str, label: str, prompt: str, max_input_tokens: Optional[int], max_output_tokens: Optional[int]) -> str:
        """Send one user message to a Claude model and return the reply text.

        The Anthropic client is synchronous, so the request runs in a worker
        thread via ``asyncio.to_thread``. ``max_tokens`` falls back to 4096
        when no output cap is given.
        """
        await self.rate_limiters[limiter_key].acquire_async()
        try:
            if max_input_tokens:  # None or 0 means "do not clip"
                prompt = clip_prompt(prompt, max_input_tokens)
            params = {
                "model": model_id,
                "messages": [{"role": "user", "content": prompt}],
                "system": "Directly fulfill the user's request without preamble, paying very close attention to all nuances of their instructions.",
                "max_tokens": max_output_tokens if max_output_tokens is not None else 4096,
            }
            response = await asyncio.to_thread(self.claude_client.messages.create, **params)
            return response.content[0].text
        except Exception as e:
            logger.error(f"Error with {label} API: {e}")
            raise LLMAPIError(f"{label} API error: {str(e)}") from e
        finally:
            self.rate_limiters[limiter_key].release()

    # ------------------------------------------------------------------
    # Gemini
    # ------------------------------------------------------------------

    def _query_gemini_flash_sync(self, prompt: str, max_input_tokens: Optional[int] = None, max_output_tokens: Optional[int] = None) -> str:
        """Blocking Gemini Flash request; any failure is re-raised as ``LLMAPIError``."""
        self.rate_limiters['gemini_flash'].acquire()
        try:
            if max_input_tokens:
                prompt = clip_prompt(prompt, max_input_tokens)
            logger.info("Generating content with Gemini Flash API (Sync).")
            options = self._gemini_call_options(max_output_tokens)
            response = self.gemini_client.generate_content(prompt, **options)
            return self._gemini_text(response)
        except Exception as e:
            logger.error(f"Error with Gemini Flash API: {e}")
            raise LLMAPIError(f"Gemini Flash API error: {str(e)}") from e
        finally:
            self.rate_limiters['gemini_flash'].release()

    async def _query_gemini_flash_async(self, prompt: str, max_input_tokens: Optional[int] = None, max_output_tokens: Optional[int] = None) -> str:
        """Awaitable Gemini Flash request; any failure is re-raised as ``LLMAPIError``."""
        await self.rate_limiters['gemini_flash'].acquire_async()
        try:
            if max_input_tokens:
                prompt = clip_prompt(prompt, max_input_tokens)
            logger.info("Generating content with Gemini Flash API (Async).")
            options = self._gemini_call_options(max_output_tokens)
            response = await self.gemini_client.generate_content_async(prompt, **options)
            return self._gemini_text(response)
        except Exception as e:
            logger.error(f"Error with Gemini Flash API: {e}")
            raise LLMAPIError(f"Gemini Flash API error: {str(e)}") from e
        finally:
            self.rate_limiters['gemini_flash'].release()

    # ------------------------------------------------------------------
    # OpenAI
    # ------------------------------------------------------------------

    async def _query_gpt_4o_async(self, prompt: str, max_input_tokens: Optional[int] = None, max_output_tokens: Optional[int] = None) -> str:
        """Awaitable GPT-4o request."""
        return await self._openai_chat('gpt_4o', "gpt-4o-2024-05-13", "GPT-4o", prompt, max_input_tokens, max_output_tokens)

    def _query_gpt_4o_sync(self, prompt: str, max_input_tokens: Optional[int] = None, max_output_tokens: Optional[int] = None) -> str:
        """Blocking GPT-4o request (runs the async variant on a private loop)."""
        return self._run_blocking(self._query_gpt_4o_async(prompt, max_input_tokens, max_output_tokens))

    async def _query_gpt_4o_mini_async(self, prompt: str, max_input_tokens: Optional[int] = None, max_output_tokens: Optional[int] = None) -> str:
        """Awaitable GPT-4o mini request."""
        return await self._openai_chat('gpt_4o_mini', "gpt-4o-mini-2024-07-18", "GPT-4o mini", prompt, max_input_tokens, max_output_tokens)

    def _query_gpt_4o_mini_sync(self, prompt: str, max_input_tokens: Optional[int] = None, max_output_tokens: Optional[int] = None) -> str:
        """Blocking GPT-4o mini request (runs the async variant on a private loop)."""
        return self._run_blocking(self._query_gpt_4o_mini_async(prompt, max_input_tokens, max_output_tokens))

    # ------------------------------------------------------------------
    # Claude
    # ------------------------------------------------------------------

    async def _query_claude_3_5_sonnet_async(self, prompt: str, max_input_tokens: Optional[int] = None, max_output_tokens: Optional[int] = None) -> str:
        """Awaitable Claude 3.5 Sonnet request."""
        return await self._claude_message('claude_3_5_sonnet', "claude-3-5-sonnet-20240620", "Claude 3.5 Sonnet", prompt, max_input_tokens, max_output_tokens)

    def _query_claude_3_5_sonnet_sync(self, prompt: str, max_input_tokens: Optional[int] = None, max_output_tokens: Optional[int] = None) -> str:
        """Blocking Claude 3.5 Sonnet request (runs the async variant on a private loop)."""
        return self._run_blocking(self._query_claude_3_5_sonnet_async(prompt, max_input_tokens, max_output_tokens))

    async def _query_claude_3_haiku_async(self, prompt: str, max_input_tokens: Optional[int] = None, max_output_tokens: Optional[int] = None) -> str:
        """Awaitable Claude 3 Haiku request."""
        return await self._claude_message('claude_3_haiku', "claude-3-haiku-20240307", "Claude 3 Haiku", prompt, max_input_tokens, max_output_tokens)

    def _query_claude_3_haiku_sync(self, prompt: str, max_input_tokens: Optional[int] = None, max_output_tokens: Optional[int] = None) -> str:
        """Blocking Claude 3 Haiku request (runs the async variant on a private loop)."""
        return self._run_blocking(self._query_claude_3_haiku_async(prompt, max_input_tokens, max_output_tokens))
        </content>
    </file>
    <file>
        <name>repo_context_extractor.py</name>
        <path>async_llm_handler\repo_context_extractor.py</path>
        <content>
import os

# Directory names that are pruned from the walk and never descended into.
EXCLUDED_DIRS = {
    ".git",
    "__pycache__",
    "node_modules",
    ".venv",
}
# File extensions whose text is embedded in full; any other file is listed
# with a placeholder instead of its content.
FULL_CONTENT_EXTENSIONS = {
    ".py",
    ".txt",
    ".dbml",
    ".yaml",
    ".toml",
    ".md",
}

def create_file_element(file_path, root_folder):
    """Render one file as a ``<file>`` element of the repository dump.

    The element carries the file name and its path relative to *root_folder*.
    The file's text is embedded only when its extension is listed in
    ``FULL_CONTENT_EXTENSIONS``; otherwise a fixed placeholder is written.
    A file that cannot be decoded as UTF-8 gets a notice in place of its text.
    Nothing is escaped, so the result is not guaranteed to be well-formed XML.
    """
    name = os.path.basename(file_path)
    _, extension = os.path.splitext(name)
    shown_path = os.path.relpath(file_path, root_folder)

    header = f"    <file>\n        <name>{name}</name>\n        <path>{shown_path}</path>\n"
    footer = "    </file>\n"

    if extension not in FULL_CONTENT_EXTENSIONS:
        return header + "        <content>Full content not provided</content>\n" + footer

    try:
        with open(file_path, "r", encoding="utf-8") as source:
            text = source.read()
    except UnicodeDecodeError:
        text = "Binary or non-UTF-8 content not displayed"

    return "".join((header, "        <content>\n", text, "\n        </content>\n", footer))

def get_repo_structure(root_folder):
    """Return the repository under *root_folder* as a nested XML-like string.

    Each directory becomes a ``<directory name="...">`` element that contains
    its files (rendered by ``create_file_element``) followed by its
    sub-directories. Directories named in ``EXCLUDED_DIRS`` are pruned.

    A directory's closing tag is written only once the walk has left that
    directory, so sub-directories are nested inside their parent instead of
    being emitted after the parent was already closed.
    """
    structure = ["<repository_structure>\n"]
    # Indents of the directories that are open at the moment, outermost first.
    open_indents = []

    for subdir, dirs, files in os.walk(root_folder):
        # Pruning in place stops os.walk from descending into excluded folders.
        dirs[:] = [d for d in dirs if d not in EXCLUDED_DIRS]

        # Depth comes from the relative path rather than string replacement,
        # so a trailing separator or a repeated path fragment cannot skew it.
        relative_subdir = os.path.relpath(subdir, root_folder)
        if relative_subdir == os.curdir:
            level = 0
        else:
            level = relative_subdir.count(os.sep) + 1

        # os.walk is top-down: reaching depth `level` means every directory
        # still open at that depth or deeper has been fully visited.
        while len(open_indents) > level:
            structure.append(f"{open_indents.pop()}</directory>\n")

        indent = " " * 4 * level
        name = os.path.basename(os.path.normpath(subdir))
        structure.append(f'{indent}<directory name="{name}">\n')
        structure.extend(
            create_file_element(os.path.join(subdir, file), root_folder)
            for file in files
        )
        open_indents.append(indent)

    # Close whatever is still open, innermost first.
    while open_indents:
        structure.append(f"{open_indents.pop()}</directory>\n")

    structure.append("</repository_structure>\n")
    return "".join(structure)

def main(root_folder=None):
    """Write the repository dump to ``repository_context.txt`` in the root folder.

    Args:
        root_folder: Directory to dump. When omitted, the original author's
            project path is used, so a bare ``main()`` behaves as before.
    """
    if root_folder is None:
        root_folder = r"C:\Users\bnsoh2\OneDrive - University of Nebraska-Lincoln\Documents\Projects\async_llm_handler"
    output_file = os.path.join(root_folder, "repository_context.txt")

    # The output file sits inside the walked folder and ".txt" content is
    # embedded in full, so a stale dump is removed before walking.
    if os.path.exists(output_file):
        os.remove(output_file)
        print(f"Deleted previous {output_file}")

    repo_structure = get_repo_structure(root_folder)

    with open(output_file, "w", encoding="utf-8") as f:
        f.write(repo_structure)

    print(f"Fresh repository context has been extracted to {output_file}")


if __name__ == "__main__":
    main()
        </content>
    </file>
    <file>
        <name>__init__.py</name>
        <path>async_llm_handler\__init__.py</path>
        <content>
# File: async_llm_handler/__init__.py

from async_llm_handler.handler import LLMHandler
from async_llm_handler.config import Config

# Names exported by ``from async_llm_handler import *``.
__all__ = ['LLMHandler', 'Config']
# Package version string; pyproject.toml declares the same value.
__version__ = "0.1.0"
        </content>
    </file>
    </directory>
        <directory name="examples">
    <file>
        <name>async_example.py</name>
        <path>async_llm_handler\examples\async_example.py</path>
        <content>
# File: async_llm_handler/examples/async_example.py

import asyncio
from async_llm_handler import LLMHandler

async def main():
    """Query every supported model concurrently, then send one token-limited request."""
    handler = LLMHandler()
    prompt = "What is the meaning of life?"

    # Fan out: one coroutine per model, awaited together. Failures are
    # collected as exception objects instead of aborting the whole batch.
    models = ['gemini_flash', 'gpt_4o', 'gpt_4o_mini', 'claude_3_5_sonnet', 'claude_3_haiku']
    pending = [handler.query(prompt, model=name, sync=False) for name in models]
    outcomes = await asyncio.gather(*pending, return_exceptions=True)

    for name, outcome in zip(models, outcomes):
        if not isinstance(outcome, Exception):
            print(f"{name.replace('_', ' ').title()} Response: {outcome}\n")
        else:
            print(f"Error with {name}: {str(outcome)}\n")

    # Same API with caps on both the prompt and the response length.
    limited_prompt = "Summarize the entire history of human civilization in great detail."
    limits = {"max_input_tokens": 1000, "max_output_tokens": 100}
    try:
        response = await handler.query(limited_prompt, model='gpt_4o', sync=False, **limits)
        print(f"GPT-4o Response (limited input to 1000 tokens, output to 100 tokens): {response}\n")
    except Exception as e:
        print(f"Error with GPT-4o (limited tokens): {str(e)}\n")


if __name__ == "__main__":
    asyncio.run(main())
        </content>
    </file>
    <file>
        <name>sync_example.py</name>
        <path>async_llm_handler\examples\sync_example.py</path>
        <content>
# File: async_llm_handler/examples/sync_example.py

from async_llm_handler import LLMHandler

def main():
    """Query every supported model one after another, then send one token-limited request."""
    handler = LLMHandler()
    prompt = "What is the meaning of life?"

    # One blocking call per model; a failure is reported and the loop goes on.
    for name in ('gemini_flash', 'gpt_4o', 'gpt_4o_mini', 'claude_3_5_sonnet', 'claude_3_haiku'):
        try:
            answer = handler.query(prompt, model=name, sync=True)
            print(f"{name.replace('_', ' ').title()} Response: {answer}\n")
        except Exception as e:
            print(f"Error with {name}: {str(e)}\n")

    # Same API with caps on both the prompt and the response length.
    limited_prompt = "Summarize the entire history of human civilization in great detail."
    limits = {"max_input_tokens": 1000, "max_output_tokens": 100}
    try:
        response = handler.query(limited_prompt, model='gpt_4o', sync=True, **limits)
        print(f"GPT-4o Response (limited input to 1000 tokens, output to 100 tokens): {response}\n")
    except Exception as e:
        print(f"Error with GPT-4o (limited tokens): {str(e)}\n")


if __name__ == "__main__":
    main()
        </content>
    </file>
        </directory>
        <directory name="tests">
    <file>
        <name>test_handler.py</name>
        <path>async_llm_handler\tests\test_handler.py</path>
        <content>
# File: async_llm_handler/tests/test_handler.py

import sys
import os
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '.')))

import pytest
from async_llm_handler import LLMHandler, Config
from async_llm_handler.exceptions import LLMAPIError

@pytest.fixture
def handler():
    """Provide a fresh ``LLMHandler`` built from the default configuration."""
    instance = LLMHandler()
    return instance

def test_query(handler):
    """A synchronous query returns non-empty text.

    NOTE: this goes through the handler's OpenAI client, so it needs
    OPENAI_API_KEY and network access.
    """
    # ``model`` is a required argument of LLMHandler.query.
    response = handler.query("Test prompt", model="gpt_4o_mini")
    assert isinstance(response, str)
    assert len(response) > 0

@pytest.mark.asyncio
async def test_async_query(handler):
    """An asynchronous query returns non-empty text.

    NOTE: this goes through the handler's OpenAI client, so it needs
    OPENAI_API_KEY and network access.
    """
    # ``model`` is a required argument of LLMHandler._async_query.
    response = await handler._async_query("Test prompt", model="gpt_4o_mini")
    assert isinstance(response, str)
    assert len(response) > 0

def test_invalid_model(handler):
    """An unknown model name is rejected with ``ValueError``."""
    unknown_model = "invalid_model"
    with pytest.raises(ValueError):
        handler.query("Test prompt", model=unknown_model)

@pytest.mark.asyncio
async def test_all_apis_fail(monkeypatch):
    """A failure in any model's query method reaches the caller as ``LLMAPIError``.

    The handler has no cross-model fallback, so each model is checked on its
    own: its ``_query_<model>_async`` method is replaced by one that always
    fails, and the dispatcher must let that error through.
    """
    async def mock_api_error(*args, **kwargs):
        raise LLMAPIError("API Error")

    handler = LLMHandler()
    models = ['gemini_flash', 'gpt_4o', 'gpt_4o_mini', 'claude_3_5_sonnet', 'claude_3_haiku']
    for model in models:
        # These are the attribute names the async dispatcher looks up.
        monkeypatch.setattr(handler, f'_query_{model}_async', mock_api_error)

    for model in models:
        with pytest.raises(LLMAPIError, match="API Error"):
            await handler._async_query("Test prompt", model=model)
        </content>
    </file>
    <file>
        <name>test_utils.py</name>
        <path>async_llm_handler\tests\test_utils.py</path>
        <content>
# File: async_llm_handler/tests/test_utils.py

import pytest
from async_llm_handler.utils import count_tokens, clip_prompt, RateLimiter

def test_count_tokens():
    """A non-empty string counts as at least one token."""
    token_total = count_tokens("Hello, world!")
    assert token_total > 0

def test_clip_prompt():
    """Clipping an over-long prompt leaves no more than the allowed tokens."""
    limit = 10
    oversized = "This is a very long prompt " * 100
    assert count_tokens(clip_prompt(oversized, limit)) <= limit

@pytest.mark.asyncio
async def test_rate_limiter():
    """Once the bucket is empty, the next acquisition waits for one token to refill."""
    import time  # local import: this module's header does not import time

    rate, period = 2, 1
    limiter = RateLimiter(rate=rate, period=period)

    start_time = time.monotonic()

    # The bucket starts full, so the first `rate` acquisitions do not wait.
    for _ in range(rate):
        await limiter.acquire_async()
        limiter.release()

    # This should wait
    await limiter.acquire_async()
    limiter.release()

    end_time = time.monotonic()

    # One token refills in period / rate seconds; allow 10% timer slack.
    assert end_time - start_time >= 0.9 * period / rate

def test_logger():
    """``get_logger`` returns a logger with the requested name at INFO level."""
    from async_llm_handler.utils import get_logger

    expected_name = "test_logger"
    created = get_logger(expected_name)
    assert created.name == expected_name
    assert created.level == 20  # INFO level
        </content>
    </file>
    <file>
        <name>__init__.py</name>
        <path>async_llm_handler\tests\__init__.py</path>
        <content>
# File: async_llm_handler/tests/__init__.py
# This file can be left empty
        </content>
    </file>
        </directory>
        <directory name="utils">
    <file>
        <name>logger.py</name>
        <path>async_llm_handler\utils\logger.py</path>
        <content>
# File: async_llm_handler/utils/logger.py

import logging

def get_logger(name):
    """Return the logger called *name*, configuring it the first time it is seen.

    A logger that has no handlers yet gets a stream handler with a
    timestamped format and its level set to INFO. A logger that already has
    a handler is returned untouched, so repeated calls never stack handlers.
    """
    log = logging.getLogger(name)
    if log.handlers:
        return log

    stream = logging.StreamHandler()
    stream.setFormatter(
        logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    )
    log.addHandler(stream)
    log.setLevel(logging.INFO)
    return log
        </content>
    </file>
    <file>
        <name>rate_limiter.py</name>
        <path>async_llm_handler\utils\rate_limiter.py</path>
        <content>
# File: async_llm_handler/utils/rate_limiter.py

import asyncio
import time

class RateLimiter:
    def __init__(self, rate: int, period: int = 60):
        self.rate = rate
        self.period = period
        self.allowance = rate
        self.last_check = time.monotonic()
        self._lock = asyncio.Lock()

    def acquire(self):
        current = time.monotonic()
        time_passed = current - self.last_check
        self.last_check = current
        self.allowance += time_passed * (self.rate / self.period)
        if self.allowance > self.rate:
            self.allowance = self.rate
        if self.allowance < 1:
            time.sleep((1 - self.allowance) / (self.rate / self.period))
            self.allowance = 0
        else:
            self.allowance -= 1

    async def acquire_async(self):
        async with self._lock:
            current = time.monotonic()
            time_passed = current - self.last_check
            self.last_check = current
            self.allowance += time_passed * (self.rate / self.period)
            if self.allowance > self.rate:
                self.allowance = self.rate
            if self.allowance < 1:
                await asyncio.sleep((1 - self.allowance) / (self.rate / self.period))
                self.allowance = 0
            else:
                self.allowance -= 1

    def release(self):
        pass  # No action needed for release in this implementation
        </content>
    </file>
    <file>
        <name>token_utils.py</name>
        <path>async_llm_handler\utils\token_utils.py</path>
        <content>
# File: async_llm_handler/utils/token_utils.py

import tiktoken

def count_tokens(text, encoding_name="cl100k_base"):
    """Return how many tokens *text* encodes to under the named tiktoken encoding."""
    return len(tiktoken.get_encoding(encoding_name).encode(text))

def clip_prompt(prompt, max_tokens, encoding_name="cl100k_base"):
    """Cut *prompt* down to its first *max_tokens* tokens.

    The prompt is encoded with the named tiktoken encoding. When it already
    fits, the original string is returned unchanged; otherwise the leading
    *max_tokens* tokens are decoded back into text.
    """
    codec = tiktoken.get_encoding(encoding_name)
    token_ids = codec.encode(prompt)
    if len(token_ids) <= max_tokens:
        return prompt
    return codec.decode(token_ids[:max_tokens])
        </content>
    </file>
    <file>
        <name>__init__.py</name>
        <path>async_llm_handler\utils\__init__.py</path>
        <content>
# File: async_llm_handler/utils/__init__.py

from .logger import get_logger
from .rate_limiter import RateLimiter
from .token_utils import count_tokens, clip_prompt

# Public helpers of the utils package, re-exported from its submodules.
__all__ = ['get_logger', 'RateLimiter', 'count_tokens', 'clip_prompt']
        </content>
    </file>
        </directory>
</repository_structure>
