init

2025-08-29 21:33:33 +02:00
parent df4eeca9cb
commit 2b8271263d
36 changed files with 1439 additions and 0 deletions
--- a/src/main.py
+++ b/src/main.py
@@ -0,0 +1,174 @@
+#!/usr/bin/env python3
+"""
+CensorBot - Data Sanitization Tool
+A NiceGUI-based application for removing sensitive customer information from text
+"""
+import asyncio
+import os
+import random
+from typing import List
+from dotenv import load_dotenv
+
+from nicegui import ui
+
+from lib import get_response, LLMBackend, LLMMessage
+load_dotenv()
+
+
+def get_random_example_text() -> str:
+    examples_dir = "examples"
+
+    # Get all .txt files
+    txt_files = [f for f in os.listdir(examples_dir) if f.endswith('.txt')]
+
+    if not txt_files:
+        raise FileNotFoundError("No .txt files found in examples directory")
+
+    # Pick random file
+    random_file = random.choice(txt_files)
+    file_path = os.path.join(examples_dir, random_file)
+
+    # Read and return content
+    with open(file_path, 'r', encoding='utf-8') as f:
+        return f.read()
+
+
+async def main():
+    input_text: ui.textarea
+    output_text: ui.textarea
+
+    prompt: str
+
+    with open('src/prompt.md') as prompt_file:
+        prompt = prompt_file.read()
+
+    backend: LLMBackend = {'base_url': os.environ['BACKEND_BASE_URL'],
+                           'api_token': os.environ['BACKEND_API_TOKEN'],
+                           'model': os.environ['BACKEND_MODEL']}
+
+    async def censor_input():
+        messages: List[LLMMessage] = [
+            {'role': 'system', 'content': prompt},
+            {'role': 'user', 'content': input_text.value}
+        ]
+        try:
+            # Stream the response with cancellation support
+            async for chunk in get_response(backend, messages, True):  # type: ignore
+                # Check if task was cancelled
+                current_task = asyncio.current_task()
+                if current_task and current_task.cancelled():
+                    break
+
+                if 'content' in chunk:
+                    output_text.value += chunk['content']
+                    print(chunk['content'])
+
+                # Small delay to allow UI updates and cancellation checks
+                await asyncio.sleep(0.01)
+
+        except asyncio.CancelledError:
+            ui.notify('Generation stopped by user', type='info')
+            # Save whatever content we have so far
+            return
+
+    # Application header
+    with ui.header(elevated=True).classes('q-pa-md'):
+        ui.label('🔒 CensorBot').classes('text-h4 text-weight-bold')
+        ui.label('Secure Data Sanitization for IT Service Companies').classes('text-subtitle1 text-grey-7')
+
+    # Main container
+    with ui.column().classes('w-full max-w-6xl mx-auto q-pa-lg q-gutter-md'):
+
+        # Input section
+        with ui.card().classes('w-full'):
+            ui.label('Original Text').classes('text-h6 text-weight-medium')
+            ui.label('Contains sensitive customer information').classes('text-caption text-grey-7')
+
+            input_text = ui.textarea(
+                placeholder='Paste your text here...\n\n'
+                'Example:\n'
+                'Customer John Smith called from 555-1234 about issue with account john@example.com',
+                value=get_random_example_text()
+            ).classes('w-full').style('font-family: monospace').props('autogrow')
+
+            # Character count
+            char_count_label = ui.label('0 characters').classes('text-caption text-grey-6')
+
+        # Output section
+        with ui.card().classes('w-full'):
+            ui.label('Censored Text').classes('text-h6 text-weight-medium')
+            ui.label('Safe to use with external LLMs').classes('text-caption text-green-7')
+
+            output_text = ui.textarea(
+                placeholder='Censored text will appear here...\n\n'
+                'Example:\n'
+                'Customer [CUSTOMER_NAME] called from [PHONE_NUMBER] about issue with account [EMAIL]',
+                value=''
+            ).classes('w-full').style('font-family: monospace; background-color: #f5f5f5').props('readonly autogrow')
+
+            # Copy button
+            with ui.row().classes('w-full justify-end q-gutter-sm'):
+                copy_button = ui.button('Copy to Clipboard', icon='content_copy').props('outline')
+                copy_button.disable()
+
+        # Action buttons
+        with ui.card().classes('w-full'):
+            with ui.row().classes('w-full justify-center q-gutter-md'):
+                clear_button = ui.button('Clear All', icon='clear').props('outline color=negative')
+                process_button = ui.button('Censor Data', icon='shield', on_click=censor_input).props('color=primary size=lg')
+
+        # Statistics section
+        with ui.expansion('Processing Statistics', icon='analytics').classes('w-full'):
+            with ui.row().classes('w-full q-gutter-md'):
+                with ui.column().classes('col'):
+                    ui.label('Items Censored').classes('text-weight-medium')
+                    stats_censored = ui.label('0').classes('text-h4 text-primary')
+
+                with ui.column().classes('col'):
+                    ui.label('Processing Time').classes('text-weight-medium')
+                    stats_time = ui.label('0.0s').classes('text-h4 text-primary')
+
+                with ui.column().classes('col'):
+                    ui.label('Data Reduction').classes('text-weight-medium')
+                    stats_reduction = ui.label('0%').classes('text-h4 text-primary')
+
+        # Event handlers (mockup only - no real functionality)
+        def update_char_count():
+            char_count_label.text = f'{len(input_text.value)} characters'
+
+        def mock_copy():
+            ui.notify('Text copied to clipboard (mockup)', type='positive')
+
+        def clear_all():
+            input_text.value = ''
+            output_text.value = ''
+            copy_button.disable()
+            stats_censored.text = '0'
+            stats_time.text = '0.0s'
+            stats_reduction.text = '0%'
+            update_char_count()
+
+        # Connect event handlers
+        input_text.on('input', update_char_count)
+        copy_button.on_click(mock_copy)
+        clear_button.on_click(clear_all)
+
+    # Footer
+    with ui.footer().classes('q-pa-md text-center'):
+        ui.label('CensorBot - Protecting Customer Privacy').classes('text-caption text-grey-6')
+        ui.label('⚠️ This is a mockup - no actual processing implemented yet').classes('text-caption text-orange')
+
+
+# Run the application
+if __name__ in {"__main__", "__mp_main__"}:
+    @ui.page('/')
+    async def _():
+        await main()
+
+    ui.run(
+        title='CensorBot - Data Sanitization Tool',
+        favicon='🔒',
+        show=False,
+        dark=False,
+        port=8080
+    )
--- a/src/prompt.md
+++ b/src/prompt.md
@@ -0,0 +1,43 @@
+# Data Censoring Instructions
+
+You are a data sanitization assistant. Your sole purpose is to identify and replace sensitive customer information with appropriate placeholders while maintaining the context and meaning of the text.
+
+## What to Censor
+
+Replace the following types of sensitive information:
+
+1. **Personal Names**: Replace with `[NAME]` or `[CUSTOMER_NAME]`
+2. **Email Addresses**: Replace with `[EMAIL]`
+3. **Phone Numbers**: Replace with `[PHONE]`
+4. **Physical Addresses**: Replace with `[ADDRESS]`
+5. **Social Security Numbers**: Replace with `[SSN]`
+6. **Credit Card Numbers**: Replace with `[CREDIT_CARD]`
+7. **Bank Account Numbers**: Replace with `[ACCOUNT_NUMBER]`
+8. **Driver's License Numbers**: Replace with `[LICENSE]`
+9. **Passport Numbers**: Replace with `[PASSPORT]`
+10. **Medical Record Numbers**: Replace with `[MRN]`
+11. **IP Addresses**: Replace with `[IP_ADDRESS]`
+12. **Usernames/User IDs**: Replace with `[USERNAME]`
+13. **Passwords**: Replace with `[PASSWORD]`
+14. **Company Names** (when context indicates it's customer data): Replace with `[COMPANY]`
+15. **Dates of Birth**: Replace with `[DOB]`
+
+## Rules
+
+1. **Preserve Context**: Keep all non-sensitive text exactly as provided
+2. **Maintain Structure**: Preserve formatting, punctuation, and spacing
+3. **Be Consistent**: Use the same placeholder for the same entity throughout the text
+4. **No Commentary**: Output ONLY the censored text, no explanations or additional text
+5. **When in Doubt**: If something might be sensitive, censor it
+
+## Example
+
+Input:
+"John Smith from Acme Corp called at 555-1234 about his account john.smith@acme.com. His credit card ending in 4567 was declined."
+
+Output:
+"[CUSTOMER_NAME] from [COMPANY] called at [PHONE] about his account [EMAIL]. His credit card ending in [CREDIT_CARD] was declined."
+
+## Your Task
+
+Censor the following text by replacing all sensitive information with appropriate placeholders. Output only the censored version: