This commit is contained in:
2025-08-29 21:33:33 +02:00
parent df4eeca9cb
commit 2b8271263d
36 changed files with 1439 additions and 0 deletions

174
src/main.py Normal file
View File

@@ -0,0 +1,174 @@
#!/usr/bin/env python3
"""
CensorBot - Data Sanitization Tool
A NiceGUI-based application for removing sensitive customer information from text
"""
import asyncio
import os
import random
from typing import List
from dotenv import load_dotenv
from nicegui import ui
from lib import get_response, LLMBackend, LLMMessage
load_dotenv()
def get_random_example_text(examples_dir: str = "examples") -> str:
    """Return the contents of a randomly chosen ``.txt`` example file.

    Args:
        examples_dir: Directory to search for example files. Defaults to
            ``"examples"``, which is resolved relative to the current
            working directory.

    Returns:
        The full text of one randomly selected ``.txt`` file, read as UTF-8.

    Raises:
        FileNotFoundError: If the directory does not exist or holds no
            regular ``.txt`` files.
    """
    # Only regular files qualify: a sub-directory whose name happens to end
    # in ".txt" would otherwise be selectable and make open() fail.
    txt_files = [
        name
        for name in os.listdir(examples_dir)
        if name.endswith('.txt')
        and os.path.isfile(os.path.join(examples_dir, name))
    ]
    if not txt_files:
        raise FileNotFoundError(
            f"No .txt files found in {examples_dir} directory"
        )

    chosen_path = os.path.join(examples_dir, random.choice(txt_files))
    with open(chosen_path, 'r', encoding='utf-8') as f:
        return f.read()
async def main():
    """Build the CensorBot page and wire up its event handlers.

    Reads the system prompt from ``src/prompt.md`` and the LLM backend
    settings from the ``BACKEND_BASE_URL``, ``BACKEND_API_TOKEN`` and
    ``BACKEND_MODEL`` environment variables, then creates the header, the
    input/output cards, the action buttons, the statistics panel and the
    footer.

    Raises:
        KeyError: If one of the backend environment variables is not set.
        OSError: If the prompt file cannot be opened.
        FileNotFoundError: If no example text is available for the input box.
    """
    input_text: ui.textarea
    output_text: ui.textarea
    prompt: str

    # Read the prompt as UTF-8 explicitly, the same way the example files are
    # read, instead of relying on the platform's default encoding.
    with open('src/prompt.md', encoding='utf-8') as prompt_file:
        prompt = prompt_file.read()

    backend: LLMBackend = {'base_url': os.environ['BACKEND_BASE_URL'],
                           'api_token': os.environ['BACKEND_API_TOKEN'],
                           'model': os.environ['BACKEND_MODEL']}

    async def censor_input():
        """Stream the model's reply for the current input into the output box."""
        messages: List[LLMMessage] = [
            {'role': 'system', 'content': prompt},
            {'role': 'user', 'content': input_text.value}
        ]
        # Start from an empty output so that running the censor a second time
        # does not append to the previous result.
        output_text.value = ''
        try:
            # NOTE(review): the third positional argument presumably enables
            # streaming - confirm against lib.get_response.
            async for chunk in get_response(backend, messages, True):  # type: ignore
                if 'content' in chunk:
                    output_text.value += chunk['content']
                    print(chunk['content'])
                # Yield to the event loop between chunks. A cancellation
                # request surfaces here as CancelledError; polling
                # Task.cancelled() from inside the running task would never
                # report True, so no explicit check is made.
                await asyncio.sleep(0.01)
        except asyncio.CancelledError:
            ui.notify('Generation stopped by user', type='info')
            # Keep whatever content has been streamed so far
            return

    # Application header
    with ui.header(elevated=True).classes('q-pa-md'):
        ui.label('🔒 CensorBot').classes('text-h4 text-weight-bold')
        ui.label('Secure Data Sanitization for IT Service Companies').classes('text-subtitle1 text-grey-7')

    # Main container
    with ui.column().classes('w-full max-w-6xl mx-auto q-pa-lg q-gutter-md'):
        # Input section
        with ui.card().classes('w-full'):
            ui.label('Original Text').classes('text-h6 text-weight-medium')
            ui.label('Contains sensitive customer information').classes('text-caption text-grey-7')
            # Fetch the example once so the textarea and the character
            # counter start out in agreement.
            initial_text = get_random_example_text()
            input_text = ui.textarea(
                placeholder='Paste your text here...\n\n'
                            'Example:\n'
                            'Customer John Smith called from 555-1234 about issue with account john@example.com',
                value=initial_text
            ).classes('w-full').style('font-family: monospace').props('autogrow')
            # Character count
            char_count_label = ui.label(f'{len(initial_text)} characters').classes('text-caption text-grey-6')

        # Output section
        with ui.card().classes('w-full'):
            ui.label('Censored Text').classes('text-h6 text-weight-medium')
            ui.label('Safe to use with external LLMs').classes('text-caption text-green-7')
            output_text = ui.textarea(
                placeholder='Censored text will appear here...\n\n'
                            'Example:\n'
                            'Customer [CUSTOMER_NAME] called from [PHONE_NUMBER] about issue with account [EMAIL]',
                value=''
            ).classes('w-full').style('font-family: monospace; background-color: #f5f5f5').props('readonly autogrow')
            # Copy button
            with ui.row().classes('w-full justify-end q-gutter-sm'):
                copy_button = ui.button('Copy to Clipboard', icon='content_copy').props('outline')
                copy_button.disable()

        # Action buttons
        with ui.card().classes('w-full'):
            with ui.row().classes('w-full justify-center q-gutter-md'):
                clear_button = ui.button('Clear All', icon='clear').props('outline color=negative')
                process_button = ui.button('Censor Data', icon='shield', on_click=censor_input).props('color=primary size=lg')

        # Statistics section
        with ui.expansion('Processing Statistics', icon='analytics').classes('w-full'):
            with ui.row().classes('w-full q-gutter-md'):
                with ui.column().classes('col'):
                    ui.label('Items Censored').classes('text-weight-medium')
                    stats_censored = ui.label('0').classes('text-h4 text-primary')
                with ui.column().classes('col'):
                    ui.label('Processing Time').classes('text-weight-medium')
                    stats_time = ui.label('0.0s').classes('text-h4 text-primary')
                with ui.column().classes('col'):
                    ui.label('Data Reduction').classes('text-weight-medium')
                    stats_reduction = ui.label('0%').classes('text-h4 text-primary')

    # Event handlers (copy and statistics are still mockups)
    def update_char_count():
        """Show the current length of the input text in the counter label."""
        char_count_label.text = f'{len(input_text.value)} characters'

    def mock_copy():
        """Show a notification only; nothing is placed on the clipboard."""
        ui.notify('Text copied to clipboard (mockup)', type='positive')

    def clear_all():
        """Empty both text areas and reset the copy button and statistics."""
        input_text.value = ''
        output_text.value = ''
        copy_button.disable()
        stats_censored.text = '0'
        stats_time.text = '0.0s'
        stats_reduction.text = '0%'
        update_char_count()

    # Connect event handlers
    input_text.on('input', update_char_count)
    copy_button.on_click(mock_copy)
    clear_button.on_click(clear_all)

    # Footer
    with ui.footer().classes('q-pa-md text-center'):
        ui.label('CensorBot - Protecting Customer Privacy').classes('text-caption text-grey-6')
        ui.label('⚠️ This is a mockup - no actual processing implemented yet').classes('text-caption text-orange')
# Entry point: register the index page, then start the NiceGUI server.
# "__mp_main__" is the name the main module gets in processes spawned by
# multiprocessing, so the same setup also runs there.
if __name__ in ("__main__", "__mp_main__"):
    @ui.page('/')
    async def index_page():
        """Render the CensorBot UI for a client visiting the root path."""
        await main()

    ui.run(
        port=8080,
        dark=False,
        show=False,
        favicon='🔒',
        title='CensorBot - Data Sanitization Tool',
    )

43
src/prompt.md Normal file
View File

@@ -0,0 +1,43 @@
# Data Censoring Instructions
You are a data sanitization assistant. Your sole purpose is to identify and replace sensitive customer information with appropriate placeholders while maintaining the context and meaning of the text.
## What to Censor
Replace the following types of sensitive information:
1. **Personal Names**: Replace with `[CUSTOMER_NAME]` when the person is a customer, otherwise with `[NAME]`
2. **Email Addresses**: Replace with `[EMAIL]`
3. **Phone Numbers**: Replace with `[PHONE]`
4. **Physical Addresses**: Replace with `[ADDRESS]`
5. **Social Security Numbers**: Replace with `[SSN]`
6. **Credit Card Numbers**: Replace with `[CREDIT_CARD]`
7. **Bank Account Numbers**: Replace with `[ACCOUNT_NUMBER]`
8. **Driver's License Numbers**: Replace with `[LICENSE]`
9. **Passport Numbers**: Replace with `[PASSPORT]`
10. **Medical Record Numbers**: Replace with `[MRN]`
11. **IP Addresses**: Replace with `[IP_ADDRESS]`
12. **Usernames/User IDs**: Replace with `[USERNAME]`
13. **Passwords**: Replace with `[PASSWORD]`
14. **Company Names** (when context indicates it's customer data): Replace with `[COMPANY]`
15. **Dates of Birth**: Replace with `[DOB]`
## Rules
1. **Preserve Context**: Keep all non-sensitive text exactly as provided
2. **Maintain Structure**: Preserve formatting, punctuation, and spacing
3. **Be Consistent**: Use the same placeholder for the same entity throughout the text
4. **No Commentary**: Output ONLY the censored text, no explanations or additional text
5. **When in Doubt**: If something might be sensitive, censor it
## Example
Input:
"John Smith from Acme Corp called at 555-1234 about his account john.smith@acme.com. His credit card ending in 4567 was declined."
Output:
"[CUSTOMER_NAME] from [COMPANY] called at [PHONE] about his account [EMAIL]. His credit card ending in [CREDIT_CARD] was declined."
## Your Task
Censor the text in the user's message by replacing all sensitive information with appropriate placeholders. Output only the censored version.