Sure! Pl

Browse files

Files changed (9) hide show

api_service.py +614 -0
data/.gitkeep +2 -0
evaluation/.gitkeep +2 -0
models/.gitkeep +2 -0
raw_dataset.json +0 -0
requirements.txt +12 -0
setup-guide.md +342 -0
test_api.py +160 -0
training_pipeline.py +772 -0

api_service.py ADDED Viewed

	@@ -0,0 +1,614 @@

+#!/usr/bin/env python3
+"""
+Backend Code Generation API Service
+===================================
+Production-ready API service for serving the trained backend code generation model.
+Provides RESTful endpoints for generating complete backend applications.
+"""
+from fastapi import FastAPI, HTTPException, BackgroundTasks, Depends
+from fastapi.middleware.cors import CORSMiddleware
+from fastapi.responses import StreamingResponse, FileResponse
+from pydantic import BaseModel, Field
+from typing import List, Dict, Optional, Any
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
+import json
+import zipfile
+import tempfile
+import os
+import uuid
+from datetime import datetime
+import asyncio
+import logging
+from pathlib import Path
+# Configure logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+# Pydantic models for API
+class CodeGenerationRequest(BaseModel):
+    # Request body for POST /api/v1/generate: what to build and with which stack.
+    description: str = Field(..., description="Description of the backend application to generate")
+    framework: str = Field(..., description="Target framework (express, fastapi, django, flask)")
+    language: str = Field(..., description="Programming language (javascript, python)")
+    # No validator restricts framework/language to the values named above; any string is accepted.
+    requirements: List[str] = Field(default=[], description="List of specific requirements")
+    project_name: Optional[str] = Field(default=None, description="Custom project name")
+    class Config:
+        # Example payload attached to the model's schema.
+        # NOTE(review): `schema_extra` is the pydantic v1 name (v2 uses `json_schema_extra`) - confirm the installed version.
+        schema_extra = {
+            "example": {
+                "description": "E-commerce API with user authentication and product management",
+                "framework": "fastapi",
+                "language": "python",
+                "requirements": [
+                    "User registration and login",
+                    "JWT authentication",
+                    "Product CRUD operations",
+                    "Shopping cart functionality",
+                    "Order management"
+                ],
+                "project_name": "ecommerce-api"
+            }
+        }
+class GenerationResponse(BaseModel):
+    # Immediate reply from POST /api/v1/generate once a task has been queued.
+    task_id: str  # identifier to use with the status/download/cleanup endpoints
+    status: str  # the generate endpoint sends "accepted"
+    message: str
+    estimated_time: int  # seconds (the generate endpoint sends a fixed 60)
+class GenerationStatus(BaseModel):
+    # Mutable progress record for one generation task; stored in `generation_tasks`
+    # and returned as-is by the status endpoint.
+    task_id: str
+    status: str  # pending, processing, completed, failed
+    progress: int  # 0-100
+    message: str
+    # Maps each generated file name to the placeholder string "Generated" (not the file contents).
+    generated_files: Optional[Dict[str, str]] = None
+    # Despite the name, this holds the local filesystem path of the ZIP archive.
+    download_url: Optional[str] = None
+    error: Optional[str] = None  # str(exception) recorded when the task failed
+class GeneratedProject(BaseModel):
+    # Schema describing a generated project.
+    # NOTE(review): not referenced by any endpoint in the visible code - confirm whether it is still needed.
+    project_name: str
+    framework: str
+    language: str
+    files: Dict[str, str]  # presumably file name -> file content; verify against callers
+    structure: Dict[str, Any]
+    setup_instructions: List[str]
+    features: List[str]
+# Global model instance
+class ModelManager:
+    def __init__(self):
+        self.model = None
+        self.tokenizer = None
+        self.device = "cuda" if torch.cuda.is_available() else "cpu"
+        self.loaded = False
+    async def load_model(self, model_path: str = "./trained_model"):
+        """Load the trained model asynchronously"""
+        try:
+            logger.info(f"Loading model from {model_path} on {self.device}")
+            self.tokenizer = AutoTokenizer.from_pretrained(model_path)
+            self.model = AutoModelForCausalLM.from_pretrained(
+                model_path,
+                torch_dtype=torch.float16 if self.device == "cuda" else torch.float32,
+                device_map="auto" if self.device == "cuda" else None
+            )
+            if self.device == "cpu":
+                self.model = self.model.to(self.device)
+            self.loaded = True
+            logger.info("Model loaded successfully!")
+        except Exception as e:
+            logger.error(f"Failed to load model: {e}")
+            raise
+    def generate_code(self, prompt: str, max_tokens: int = 1024) -> str:
+        """Generate code using the trained model"""
+        if not self.loaded:
+            raise RuntimeError("Model not loaded")
+        inputs = self.tokenizer.encode(prompt, return_tensors='pt')
+        inputs = inputs.to(self.device)
+        with torch.no_grad():
+            outputs = self.model.generate(
+                inputs,
+                max_length=min(max_tokens, 1024),
+                num_return_sequences=1,
+                temperature=0.7,
+                do_sample=True,
+                top_p=0.9,
+                pad_token_id=self.tokenizer.eos_token_id,
+                repetition_penalty=1.1
+            )
+        generated_text = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
+        return generated_text[len(self.tokenizer.decode(inputs[0], skip_special_tokens=True)):]
+# Global instances
+# Single shared ModelManager; its model is loaded by the startup hook.
+model_manager = ModelManager()
+# In-memory registry keyed by task id; a plain dict, so entries do not survive a restart.
+generation_tasks: Dict[str, GenerationStatus] = {}  # Store generation tasks
+# FastAPI app
+app = FastAPI(
+    title="Backend Code Generation API",
+    description="AI-powered backend application generator",
+    version="1.0.0",
+    docs_url="/docs",
+    redoc_url="/redoc"
+)
+# CORS middleware
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],  # Configure for production
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+@app.on_event("startup")
+async def startup_event():
+    """Load model on startup"""
+    model_path = os.getenv("MODEL_PATH", "./trained_model")
+    await model_manager.load_model(model_path)
+@app.get("/")
+async def root():
+    """API root endpoint"""
+    return {
+        "service": "Backend Code Generation API",
+        "version": "1.0.0",
+        "status": "running",
+        "model_loaded": model_manager.loaded,
+        "endpoints": {
+            "generate": "/api/v1/generate",
+            "status": "/api/v1/status/{task_id}",
+            "download": "/api/v1/download/{task_id}",
+            "health": "/health"
+        }
+    }
+@app.get("/health")
+async def health_check():
+    """Health check endpoint"""
+    return {
+        "status": "OK",
+        "timestamp": datetime.utcnow().isoformat(),
+        "model_loaded": model_manager.loaded,
+        "device": model_manager.device if model_manager.loaded else None
+    }
+@app.post("/api/v1/generate", response_model=GenerationResponse)
+async def generate_backend(
+    request: CodeGenerationRequest,
+    background_tasks: BackgroundTasks
+):
+    """Generate a complete backend application"""
+    if not model_manager.loaded:
+        raise HTTPException(status_code=503, detail="Model not loaded")
+    # Create unique task ID
+    task_id = str(uuid.uuid4())
+    # Initialize task status
+    generation_tasks[task_id] = GenerationStatus(
+        task_id=task_id,
+        status="pending",
+        progress=0,
+        message="Task queued for processing"
+    )
+    # Start background generation
+    background_tasks.add_task(
+        generate_project_background,
+        task_id,
+        request
+    )
+    return GenerationResponse(
+        task_id=task_id,
+        status="accepted",
+        message="Code generation started",
+        estimated_time=60  # seconds
+    )
+@app.get("/api/v1/status/{task_id}", response_model=GenerationStatus)
+async def get_generation_status(task_id: str):
+    """Get the status of a generation task"""
+    if task_id not in generation_tasks:
+        raise HTTPException(status_code=404, detail="Task not found")
+    return generation_tasks[task_id]
+@app.get("/api/v1/download/{task_id}")
+async def download_generated_project(task_id: str):
+    """Download the generated project as a ZIP file"""
+    if task_id not in generation_tasks:
+        raise HTTPException(status_code=404, detail="Task not found")
+    task = generation_tasks[task_id]
+    if task.status != "completed":
+        raise HTTPException(status_code=400, detail="Generation not completed")
+    if not task.download_url:
+        raise HTTPException(status_code=404, detail="Download file not available")
+    if not os.path.exists(task.download_url):
+        raise HTTPException(status_code=404, detail="Download file not found")
+    return FileResponse(
+        path=task.download_url,
+        filename=f"generated_project_{task_id}.zip",
+        media_type="application/zip"
+    )
+@app.delete("/api/v1/cleanup/{task_id}")
+async def cleanup_task(task_id: str):
+    """Clean up task files and data"""
+    if task_id not in generation_tasks:
+        raise HTTPException(status_code=404, detail="Task not found")
+    task = generation_tasks[task_id]
+    # Remove download file if exists
+    if task.download_url and os.path.exists(task.download_url):
+        os.remove(task.download_url)
+    # Remove task from memory
+    del generation_tasks[task_id]
+    return {"message": "Task cleaned up successfully"}
+async def generate_project_background(task_id: str, request: CodeGenerationRequest):
+    """Background task for generating the complete project"""
+    task = generation_tasks[task_id]
+    try:
+        # Update status
+        task.status = "processing"
+        task.progress = 10
+        task.message = "Analyzing requirements..."
+        # Create the generation prompt
+        prompt = create_generation_prompt(request)
+        # Update progress
+        task.progress = 30
+        task.message = "Generating application structure..."
+        # Generate code using the model
+        generated_code = model_manager.generate_code(prompt, max_tokens=1024)
+        # Update progress
+        task.progress = 60
+        task.message = "Processing generated code..."
+        # Parse and structure the generated code
+        project_files = parse_generated_code(generated_code, request)
+        # Update progress
+        task.progress = 80
+        task.message = "Creating project files..."
+        # Create downloadable ZIP file
+        zip_path = create_project_zip(task_id, project_files, request)
+        # Complete the task
+        task.status = "completed"
+        task.progress = 100
+        task.message = "Project generated successfully"
+        task.generated_files = {name: "Generated" for name in project_files.keys()}
+        task.download_url = zip_path
+    except Exception as e:
+        logger.error(f"Generation failed for task {task_id}: {e}")
+        task.status = "failed"
+        task.error = str(e)
+        task.message = "Generation failed"
+def create_generation_prompt(request: CodeGenerationRequest) -> str:
+    """Create the prompt for the model"""
+    prompt_parts = [
+        f"Description: {request.description}",
+        f"Framework: {request.framework}",
+        f"Language: {request.language}",
+    ]
+    if request.requirements:
+        prompt_parts.append("Requirements:")
+        for req in request.requirements:
+            prompt_parts.append(f"- {req}")
+    if request.project_name:
+        prompt_parts.append(f"Project Name: {request.project_name}")
+    prompt_parts.append("Generate the complete backend application with all necessary files:")
+    return "\n".join(prompt_parts)
+def parse_generated_code(generated_code: str, request: CodeGenerationRequest) -> Dict[str, str]:
+    """Parse the generated code into individual files"""
+    files = {}
+    # Simple parsing logic - in production, this should be more sophisticated
+    lines = generated_code.split('\n')
+    current_file = None
+    current_content = []
+    for line in lines:
+        if line.startswith('--- ') and line.endswith(' ---'):
+            # Save previous file
+            if current_file:
+                files[current_file] = '\n'.join(current_content)
+            # Start new file
+            current_file = line.replace('--- ', '').replace(' ---', '').strip()
+            current_content = []
+        elif current_file and not line.startswith('--- End ---'):
+            current_content.append(line)
+    # Save last file
+    if current_file and current_content:
+        files[current_file] = '\n'.join(current_content)
+    # If parsing failed, create basic structure based on framework
+    if not files:
+        files = create_fallback_structure(request)
+    return files
+def create_fallback_structure(request: CodeGenerationRequest) -> Dict[str, str]:
+    """Create a basic project structure if parsing fails"""
+    if request.framework.lower() == 'fastapi':
+        return {
+            'main.py': f'''from fastapi import FastAPI
+app = FastAPI(title="{request.description}")
+@app.get("/")
+async def root():
+    return {{"message": "Hello from {request.description}"}}
+@app.get("/health")
+async def health():
+    return {{"status": "OK"}}
+''',
+            'requirements.txt': '''fastapi==0.104.1
+uvicorn[standard]==0.24.0'''
+        }
+    elif request.framework.lower() == 'express':
+        return {
+            'app.js': f'''const express = require('express');
+const app = express();
+app.get('/', (req, res) => {{
+    res.json({{ message: 'Hello from {request.description}' }});
+}});
+app.get('/health', (req, res) => {{
+    res.json({{ status: 'OK' }});
+}});
+const PORT = process.env.PORT || 3000;
+app.listen(PORT, () => {{
+    console.log(`Server running on port ${{PORT}}`);
+}});
+''',
+            'package.json': json.dumps({
+                "name": request.project_name or "generated-backend",
+                "version": "1.0.0",
+                "main": "app.js",
+                "dependencies": {
+                    "express": "^4.18.2"
+                }
+            }, indent=2)
+        }
+    else:
+        return {
+            'README.md': f'# {request.description}\n\nGenerated backend application using {request.framework}'
+        }
+def create_project_zip(task_id: str, files: Dict[str, str], request: CodeGenerationRequest) -> str:
+    """Create a ZIP file containing all project files"""
+    # Create temporary directory for the ZIP file
+    temp_dir = tempfile.gettempdir()
+    zip_path = os.path.join(temp_dir, f"project_{task_id}.zip")
+    project_name = request.project_name or f"generated_{request.framework}_app"
+    with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
+        for filename, content in files.items():
+            # Add each file to the ZIP
+            arcname = f"{project_name}/{filename}"
+            zipf.writestr(arcname, content)
+        # Add a README with setup instructions
+        setup_instructions = get_setup_instructions(request.framework)
+        zipf.writestr(f"{project_name}/SETUP.md", setup_instructions)
+    return zip_path
+def get_setup_instructions(framework: str) -> str:
+    """Get setup instructions for the framework"""
+    instructions = {
+        'fastapi': '''# Setup Instructions
+1. Install dependencies:
+   ```bash
+   pip install -r requirements.txt
+   ```
+2. Run the application:
+   ```bash
+   uvicorn main:app --reload
+   ```
+3. Access the API:
+   - API: http://localhost:8000
+   - Docs: http://localhost:8000/docs
+''',
+        'express': '''# Setup Instructions
+1. Install dependencies:
+   ```bash
+   npm install
+   ```
+2. Run the application:
+   ```bash
+   node app.js
+   ```
+3. Access the API:
+   - API: http://localhost:3000
+''',
+        'django': '''# Setup Instructions
+1. Install dependencies:
+   ```bash
+   pip install -r requirements.txt
+   ```
+2. Run migrations:
+   ```bash
+   python manage.py migrate
+   ```
+3. Run the application:
+   ```bash
+   python manage.py runserver
+   ```
+4. Access the API:
+   - API: http://localhost:8000
+   - Admin: http://localhost:8000/admin
+''',
+        'flask': '''# Setup Instructions
+1. Install dependencies:
+   ```bash
+   pip install -r requirements.txt
+   ```
+2. Run the application:
+   ```bash
+   python run.py
+   ```
+3. Access the API:
+   - API: http://localhost:5000
+'''
+    }
+    return instructions.get(framework, '# Setup Instructions\n\nRefer to the framework documentation for setup instructions.')
+# Additional utility endpoints
+@app.get("/api/v1/frameworks")
+async def list_supported_frameworks():
+    """List supported frameworks and languages"""
+    return {
+        "frameworks": [
+            {
+                "name": "fastapi",
+                "language": "python",
+                "description": "Modern, fast, web framework for building APIs"
+            },
+            {
+                "name": "express",
+                "language": "javascript",
+                "description": "Fast, unopinionated web framework for Node.js"
+            },
+            {
+                "name": "django",
+                "language": "python",
+                "description": "High-level Python web framework"
+            },
+            {
+                "name": "flask",
+                "language": "python",
+                "description": "Lightweight WSGI web application framework"
+            }
+        ]
+    }
+@app.get("/api/v1/examples")
+async def get_example_requests():
+    """Get example generation requests"""
+    return {
+        "examples": [
+            {
+                "name": "E-commerce API",
+                "request": {
+                    "description": "Complete e-commerce backend with user management and product catalog",
+                    "framework": "fastapi",
+                    "language": "python",
+                    "requirements": [
+                        "User registration and authentication",
+                        "Product CRUD operations",
+                        "Shopping cart functionality",
+                        "Order management",
+                        "Payment processing integration"
+                    ]
+                }
+            },
+            {
+                "name": "Task Management System",
+                "request": {
+                    "description": "Task management system with team collaboration",
+                    "framework": "express",
+                    "language": "javascript",
+                    "requirements": [
+                        "User authentication with JWT",
+                        "Task CRUD operations",
+                        "Team and project management",
+                        "Real-time notifications",
+                        "File attachments"
+                    ]
+                }
+            },
+            {
+                "name": "Blog Platform",
+                "request": {
+                    "description": "Blog platform with content management",
+                    "framework": "django",
+                    "language": "python",
+                    "requirements": [
+                        "Article management",
+                        "User comments and ratings",
+                        "Category and tag system",
+                        "SEO optimization",
+                        "Media file handling"
+                    ]
+                }
+            }
+        ]
+    }
+if __name__ == "__main__":
+    import uvicorn
+    uvicorn.run(
+        "api_service:app",
+        host="0.0.0.0",
+        port=8000,
+        reload=True
+    )

data/.gitkeep ADDED Viewed

	@@ -0,0 +1,2 @@


1	+
2	+

evaluation/.gitkeep ADDED Viewed

	@@ -0,0 +1,2 @@


1	+
2	+

models/.gitkeep ADDED Viewed

	@@ -0,0 +1,2 @@


1	+
2	+

raw_dataset.json ADDED Viewed

The diff for this file is too large to render. See raw diff

requirements.txt ADDED Viewed

	@@ -0,0 +1,12 @@

+torch
+transformers
+datasets
+pandas
+numpy
+aiohttp
+requests
+accelerate
+fastapi
+uvicorn
+python-multipart

setup-guide.md ADDED Viewed

	@@ -0,0 +1,342 @@

+# Backend Code Generation Model - Setup & Usage Guide
+## 🛠️ Installation & Setup
+### 1. Install Dependencies
+```bash
+pip install torch transformers datasets pandas numpy aiohttp requests
+pip install accelerate  # For faster training
+```
+### 2. Set Environment Variables
+```bash
+# Optional: GitHub token for collecting real repositories
+export GITHUB_TOKEN="your_github_token_here"
+# For GPU training (if available)
+export CUDA_VISIBLE_DEVICES=0
+```
+### 3. Directory Structure
+```
+backend-ai-trainer/
+├── training_pipeline.py          # Main pipeline code
+├── data/
+│   ├── raw_dataset.json         # Collected training data
+│   └── processed/               # Preprocessed data
+├── models/
+│   ├── backend_code_model/      # Trained model output
+│   └── checkpoints/             # Training checkpoints
+└── evaluation/
+    ├── test_cases.json          # Test scenarios
+    └── results/                 # Evaluation results
+```
+## 🏃‍♂️ Quick Start
+### Option A: Full Automated Pipeline
+```python
+import asyncio
+from training_pipeline import TrainingPipeline
+config = {
+    'base_model': 'microsoft/DialoGPT-medium',
+    'output_dir': './models/backend_code_model',
+    'github_token': 'your_token_here',  # Optional
+}
+pipeline = TrainingPipeline(config)
+asyncio.run(pipeline.run_full_pipeline())
+```
+### Option B: Step-by-Step Execution
+#### Step 1: Collect Training Data
+```python
+from training_pipeline import DataCollector
+import asyncio
+collector = DataCollector()
+# Collect from GitHub (requires token)
+github_queries = [
+    'express api backend',
+    'fastapi python backend',
+    'django rest api',
+    'nodejs backend server',
+    'flask api backend'
+]
+asyncio.run(collector.collect_github_repositories(github_queries, max_repos=100))
+# Generate synthetic examples
+collector.generate_synthetic_examples(count=500)
+# Save dataset
+collector.save_dataset('training_data.json')
+```
+#### Step 2: Preprocess Data
+```python
+from training_pipeline import DataPreprocessor
+preprocessor = DataPreprocessor()
+processed_examples = preprocessor.preprocess_examples(collector.collected_examples)
+training_dataset = preprocessor.create_training_dataset(processed_examples)
+print(f"Created dataset with {len(training_dataset)} examples")
+```
+#### Step 3: Train Model
+```python
+from training_pipeline import CodeGenerationModel
+model = CodeGenerationModel('microsoft/DialoGPT-medium')
+model.fine_tune(training_dataset, output_dir='./trained_model')
+```
+#### Step 4: Generate Code
+```python
+# Generate a complete backend application
+generated_code = model.generate_code(
+    description="E-commerce API with user authentication and product management",
+    framework="fastapi",
+    language="python"
+)
+print("Generated Backend Application:")
+print("=" * 50)
+print(generated_code)
+```
+## 🎯 Training Configuration Options
+### Model Selection
+```python
+# Lightweight for testing
+config['base_model'] = 'microsoft/DialoGPT-small'
+# Balanced performance
+config['base_model'] = 'microsoft/DialoGPT-medium'
+# High quality (requires more resources)
+config['base_model'] = 'microsoft/DialoGPT-large'
+```
+### Training Parameters
+```python
+training_config = {
+    'num_epochs': 5,           # More epochs can improve fit, but too many risk overfitting
+    'batch_size': 4,           # Adjust based on GPU memory
+    'learning_rate': 5e-5,     # Conservative learning rate
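+    'max_length': 2048,        # Maximum token length; must not exceed the base model's context window (GPT-2-based models such as DialoGPT are limited to 1024 positions)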
+    'warmup_steps': 500,       # Learning rate warmup
+    'save_steps': 1000,        # Checkpoint frequency
+}
+```
+### Framework Coverage
+The pipeline supports these backend frameworks:
+**Node.js Frameworks:**
+- Express.js - Most popular Node.js framework
+- NestJS - Enterprise-grade framework
+- Koa.js - Lightweight alternative
+**Python Frameworks:**
+- FastAPI - Modern, high-performance API framework
+- Django - Full-featured web framework
+- Flask - Lightweight and flexible
+**Go Frameworks:**
+- Gin - HTTP web framework
+- Fiber - Express-inspired framework
+## 📊 Evaluation & Testing
+### Automatic Quality Assessment
+```python
+from training_pipeline import ModelEvaluator
+evaluator = ModelEvaluator()
+# Test specific code generation
+generated_code = model.generate_code(
+    description="User authentication API with JWT tokens",
+    framework="express",
+    language="javascript"
+)
+# Get quality scores
+quality_scores = evaluator.evaluate_code_quality(generated_code, "javascript")
+print(f"Syntax Correctness: {quality_scores['syntax_correctness']:.2f}")
+print(f"Completeness: {quality_scores['completeness']:.2f}")
+print(f"Best Practices: {quality_scores['best_practices']:.2f}")
+```
+### Comprehensive Benchmarking
+```python
+test_cases = [
+    {
+        'description': 'REST API for task management with user authentication',
+        'framework': 'express',
+        'language': 'javascript'
+    },
+    {
+        'description': 'GraphQL API for social media platform',
+        'framework': 'fastapi',
+        'language': 'python'
+    },
+    {
+        'description': 'Microservice for payment processing',
+        'framework': 'gin',
+        'language': 'go'
+    }
+]
+benchmark_results = evaluator.benchmark_model(model, test_cases)
+print("Overall Performance:", benchmark_results)
+```
+## 🚀 Advanced Usage
+### Custom Data Sources
+```python
+# Add your own training examples
+custom_examples = [
+    {
+        'description': 'Custom API requirement',
+        'requirements': ['Custom feature 1', 'Custom feature 2'],
+        'framework': 'fastapi',
+        'language': 'python',
+        'code_files': {
+            'main.py': '# Your custom code here',
+            'requirements.txt': 'fastapi\nuvicorn'
+        }
+    }
+]
+# Add to training data
+collector.collected_examples.extend([CodeExample(**ex) for ex in custom_examples])
+```
+### Fine-tuning on Specific Domains
+```python
+# Focus training on specific application types
+domain_specific_queries = [
+    'microservices architecture',
+    'api gateway implementation',
+    'database orm integration',
+    'authentication middleware',
+    'rate limiting api'
+]
+asyncio.run(collector.collect_github_repositories(domain_specific_queries))
+```
+### Export Trained Model
+```python
+# Save model for deployment
+model.model.save_pretrained('./production_model')
+model.tokenizer.save_pretrained('./production_model')
+# Load for inference
+from transformers import AutoModelForCausalLM, AutoTokenizer
+production_model = AutoModelForCausalLM.from_pretrained('./production_model')
+production_tokenizer = AutoTokenizer.from_pretrained('./production_model')
+```
+## 🔧 Troubleshooting
+### Common Issues
+**1. Out of Memory Errors**
+```python
+# Reduce batch size
+config['per_device_train_batch_size'] = 1
+config['gradient_accumulation_steps'] = 4
+# Use gradient checkpointing
+config['gradient_checkpointing'] = True
+```
+**2. Slow Training**
+```python
+# Enable mixed precision (if GPU supports it)
+config['fp16'] = True
+# Use more DataLoader worker processes to speed up data loading
+config['dataloader_num_workers'] = 4
+```
+**3. Poor Code Quality**
+```python
+# Increase training data diversity
+collector.generate_synthetic_examples(count=1000)
+# Extend training duration
+config['num_train_epochs'] = 10
+```
+### Performance Optimization
+**For CPU Training:**
+```python
+config['dataloader_pin_memory'] = False
+config['per_device_train_batch_size'] = 1
+```
+**For GPU Training:**
+```python
+config['fp16'] = True
+config['dataloader_pin_memory'] = True
+config['per_device_train_batch_size'] = 4
+```
+## 📈 Expected Results
+After training on ~500-1000 examples, you should expect:
+- **Syntax Correctness**: 85-95%
+- **Code Completeness**: 80-90%
+- **Best Practices**: 70-85%
+- **Framework Coverage**: All major Node.js and Python frameworks
+- **Generation Speed**: 2-5 seconds per application
+## 🔄 Continuous Improvement
+### Regular Retraining
+```python
+# Schedule weekly data collection
+import schedule
+def update_training_data():
+    asyncio.run(collector.collect_github_repositories(['new backend trends']))
+schedule.every().week.do(update_training_data)
+```
+### A/B Testing Different Models
+```python
+models_to_compare = [
+    'microsoft/DialoGPT-medium',
+    'microsoft/DialoGPT-large',
+    'gpt2-medium'
+]
+for base_model in models_to_compare:
+    model = CodeGenerationModel(base_model)
+    results = evaluator.benchmark_model(model, test_cases)
+    print(f"{base_model}: {results}")
+```
+## 🎯 Next Steps
+1. **Start Small**: Begin with synthetic data and 100-200 examples
+2. **Add Real Data**: Integrate GitHub repositories gradually
+3. **Evaluate Regularly**: Monitor quality metrics after each training session
+4. **Expand Frameworks**: Add support for new frameworks as needed
+5. **Production Deploy**: Export model for API deployment
+This pipeline provides a complete foundation for building your own backend code generation AI. The modular design allows you to customize and extend each component based on your specific needs.

test_api.py ADDED Viewed

	@@ -0,0 +1,160 @@

+#!/usr/bin/env python3
+"""
+Test script for the Backend Code Generation API
+===============================================
+Simple test script to verify the API is working correctly.
+"""
+import requests
+import json
+import time
+import os
+# API base URL
+BASE_URL = "http://localhost:8000"
+def test_health():
+    """Test the health endpoint"""
+    print("Testing health endpoint...")
+    response = requests.get(f"{BASE_URL}/health")
+    print(f"Status: {response.status_code}")
+    print(f"Response: {response.json()}")
+    return response.status_code == 200
+def test_generate_code():
+    """Test code generation"""
+    print("\nTesting code generation...")
+    # Test request
+    request_data = {
+        "description": "Simple REST API for task management",
+        "framework": "fastapi",
+        "language": "python",
+        "requirements": [
+            "User authentication",
+            "Task CRUD operations",
+            "Task status tracking"
+        ],
+        "project_name": "task-manager-api"
+    }
+    # Submit generation request
+    response = requests.post(f"{BASE_URL}/api/v1/generate", json=request_data)
+    print(f"Generation request status: {response.status_code}")
+    if response.status_code == 200:
+        result = response.json()
+        task_id = result["task_id"]
+        print(f"Task ID: {task_id}")
+        # Poll for completion
+        print("Polling for completion...")
+        for i in range(30):  # Wait up to 5 minutes
+            status_response = requests.get(f"{BASE_URL}/api/v1/status/{task_id}")
+            if status_response.status_code == 200:
+                status = status_response.json()
+                print(f"Status: {status['status']} - {status['message']} ({status['progress']}%)")
+                if status["status"] == "completed":
+                    print("✅ Generation completed!")
+                    if status.get("download_url"):
+                        print(f"Download URL: {status['download_url']}")
+                    return True
+                elif status["status"] == "failed":
+                    print(f"❌ Generation failed: {status.get('error', 'Unknown error')}")
+                    return False
+            else:
+                print(f"Failed to get status: {status_response.status_code}")
+                return False
+            time.sleep(10)  # Wait 10 seconds between polls
+        print("⏰ Timeout waiting for completion")
+        return False
+    else:
+        print(f"❌ Generation request failed: {response.text}")
+        return False
+def test_frameworks():
+    """Test frameworks endpoint"""
+    print("\nTesting frameworks endpoint...")
+    response = requests.get(f"{BASE_URL}/api/v1/frameworks")
+    print(f"Status: {response.status_code}")
+    if response.status_code == 200:
+        frameworks = response.json()
+        print(f"Supported frameworks: {len(frameworks['frameworks'])}")
+        for fw in frameworks['frameworks']:
+            print(f"  - {fw['name']} ({fw['language']})")
+        return True
+    return False
+def test_examples():
+    """Test examples endpoint"""
+    print("\nTesting examples endpoint...")
+    response = requests.get(f"{BASE_URL}/api/v1/examples")
+    print(f"Status: {response.status_code}")
+    if response.status_code == 200:
+        examples = response.json()
+        print(f"Available examples: {len(examples['examples'])}")
+        for ex in examples['examples']:
+            print(f"  - {ex['name']}")
+        return True
+    return False
+def main():
+    """Run all tests"""
+    print("🚀 Testing Backend Code Generation API")
+    print("=" * 50)
+    # Check if API is running
+    try:
+        response = requests.get(f"{BASE_URL}/", timeout=5)
+        if response.status_code != 200:
+            print("❌ API is not running. Please start it with: python api_service.py")
+            return
+    except requests.exceptions.RequestException:
+        print("❌ Cannot connect to API. Please start it with: python api_service.py")
+        return
+    print("✅ API is running")
+    # Run tests
+    tests = [
+        ("Health Check", test_health),
+        ("Frameworks List", test_frameworks),
+        ("Examples List", test_examples),
+        ("Code Generation", test_generate_code),
+    ]
+    results = []
+    for test_name, test_func in tests:
+        print(f"\n{'='*20} {test_name} {'='*20}")
+        try:
+            result = test_func()
+            results.append((test_name, result))
+        except Exception as e:
+            print(f"❌ Test failed with error: {e}")
+            results.append((test_name, False))
+    # Summary
+    print(f"\n{'='*50}")
+    print("📊 Test Results Summary:")
+    print("=" * 50)
+    passed = 0
+    for test_name, result in results:
+        status = "✅ PASS" if result else "❌ FAIL"
+        print(f"{test_name}: {status}")
+        if result:
+            passed += 1
+    print(f"\nPassed: {passed}/{len(results)} tests")
+    if passed == len(results):
+        print("🎉 All tests passed!")
+    else:
+        print("⚠️  Some tests failed. Check the output above for details.")
+# Run the API smoke tests only when this file is executed as a script, not on import.
+if __name__ == "__main__":
+    main()

training_pipeline.py ADDED Viewed

	@@ -0,0 +1,772 @@

+#!/usr/bin/env python3
+"""
+Backend Code Generation Model Training Pipeline
+===============================================
+A comprehensive training pipeline for building an AI model that generates
+framework-agnostic backend code with full application scaffolding.
+Features:
+- Data collection from multiple sources
+- Multi-framework support (Express.js, FastAPI, Django, Flask, etc.)
+- Full application scaffolding generation
+- Model training with transformer architecture
+- Evaluation and benchmarking tools
+"""
+import os
+import json
+import logging
+import asyncio
+import aiohttp
+import pandas as pd
+import numpy as np
+from typing import Dict, List, Optional, Tuple, Any
+from dataclasses import dataclass, asdict
+from pathlib import Path
+import torch
+import torch.nn as nn
+from torch.utils.data import Dataset, DataLoader
+from transformers import (
+    AutoTokenizer, AutoModelForCausalLM, TrainingArguments,
+    Trainer, DataCollatorForLanguageModeling
+)
+from datasets import Dataset as HFDataset
+import ast
+import subprocess
+import tempfile
+from concurrent.futures import ThreadPoolExecutor
+import requests
+import time
+import random
+# Configure logging: INFO level, each record prefixed with timestamp and level name
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+# Module-level logger named after this module
+logger = logging.getLogger(__name__)
+@dataclass
+class CodeExample:
+    """Represents a single training example"""
+    # Free-text summary of the project (repository description or template description)
+    description: str
+    # Short requirement statements, e.g. "Uses <dependency>" or template feature names
+    requirements: List[str]
+    # Backend framework label such as 'express' or 'fastapi'
+    framework: str
+    # Programming language label such as 'python' or 'javascript'
+    language: str
+    code_files: Dict[str, str]  # filename -> content
+    # Summary dict; the collector fills keys such as 'files', 'directories', 'total_files', 'has_tests', 'has_docs'
+    project_structure: Dict[str, Any]
+    # Extra information: GitHub stats and URLs for harvested repositories, or {'synthetic': True}
+    metadata: Dict[str, Any]
+class DataCollector:
+    """Collects training data from various sources"""
+    def __init__(self):
+        self.github_token = os.getenv('GITHUB_TOKEN')
+        self.collected_examples: List[CodeExample] = []
+    async def collect_github_repositories(self, queries: List[str], max_repos: int = 100):
+        """Collect backend projects from GitHub"""
+        logger.info("Starting GitHub repository collection...")
+        headers = {'Authorization': f'token {self.github_token}'} if self.github_token else {}
+        async with aiohttp.ClientSession(headers=headers) as session:
+            per_query = max(1, max_repos // max(1, len(queries)))
+            for query in queries:
+                await self._search_github_repos(session, query, per_query)
+    async def _search_github_repos(self, session: aiohttp.ClientSession, query: str, limit: int):
+        """Search GitHub for repositories matching query"""
+        url = f"https://api.github.com/search/repositories"
+        params = {
+            'q': query,
+            'sort': 'stars',
+            'order': 'desc',
+            'per_page': min(limit, 100)
+        }
+        try:
+            async with session.get(url, params=params) as response:
+                if response.status == 200:
+                    data = await response.json()
+                    for repo in data.get('items', []):
+                        await self._process_repository(session, repo)
+                else:
+                    logger.warning(f"GitHub API request failed: {response.status}")
+        except Exception as e:
+            logger.error(f"Error searching GitHub: {e}")
+    async def _process_repository(self, session: aiohttp.ClientSession, repo: Dict):
+        """Process a single repository to extract code examples"""
+        logger.info(f"Processing repository: {repo.get('full_name', '<unknown>')}")
+        try:
+            contents_url = f"https://api.github.com/repos/{repo['full_name']}/contents"
+            async with session.get(contents_url) as response:
+                if response.status == 200:
+                    contents = await response.json()
+                    await self._extract_code_example(session, repo, contents)
+        except Exception as e:
+            logger.error(f"Error processing repository {repo.get('full_name')}: {e}")
+    async def _extract_code_example(self, session: aiohttp.ClientSession, repo: Dict, contents: List[Dict]):
+        """Extract a structured code example from repository"""
+        framework = self._identify_framework(contents, repo.get('description', ''))
+        language = self._identify_language(contents)
+        if not framework or not language:
+            return
+        code_files: Dict[str, str] = {}
+        for item in contents:
+            if item.get('type') == 'file' and self._is_important_file(item.get('name', '')):
+                try:
+                    async with session.get(item['download_url']) as response:
+                        if response.status == 200:
+                            content = await response.text()
+                            code_files[item['name']] = content
+                except Exception:
+                    continue
+        if code_files:
+            example = CodeExample(
+                description=repo.get('description', ''),
+                requirements=self._extract_requirements(code_files),
+                framework=framework,
+                language=language,
+                code_files=code_files,
+                project_structure=self._analyze_structure(contents),
+                metadata={
+                    'stars': repo.get('stargazers_count', 0),
+                    'forks': repo.get('forks_count', 0),
+                    'url': repo.get('html_url'),
+                    'created_at': repo.get('created_at'),
+                    'updated_at': repo.get('updated_at')
+                }
+            )
+            self.collected_examples.append(example)
+    def _identify_framework(self, contents: List[Dict], description: str) -> Optional[str]:
+        """Identify the backend framework used"""
+        filenames = [item.get('name', '').lower() for item in contents if item.get('type') == 'file']
+        frameworks = {
+            'express': ['package.json', 'app.js', 'server.js'],
+            'fastapi': ['requirements.txt', 'main.py', 'app.py'],
+            'django': ['manage.py', 'settings.py', 'requirements.txt'],
+            'flask': ['app.py', 'requirements.txt'],
+            'nestjs': ['nest-cli.json', 'package.json'],
+            'koa': ['package.json'],
+            'gin': ['go.mod', 'main.go'],
+            'fiber': ['go.mod', 'main.go'],
+        }
+        for framework, required_files in frameworks.items():
+            if all(any(req in filename for filename in filenames) for req in required_files[:2]):
+                return framework
+        desc_lower = description.lower()
+        for framework in frameworks.keys():
+            if framework in desc_lower:
+                return framework
+        return None
+    def _identify_language(self, contents: List[Dict]) -> Optional[str]:
+        """Identify primary programming language"""
+        extensions: Dict[str, int] = {}
+        for item in contents:
+            if item.get('type') == 'file':
+                ext = Path(item.get('name', '')).suffix.lower()
+                if ext:
+                    extensions[ext] = extensions.get(ext, 0) + 1
+        lang_map = {
+            '.js': 'javascript',
+            '.ts': 'typescript',
+            '.py': 'python',
+            '.go': 'go',
+            '.java': 'java',
+            '.cs': 'csharp',
+            '.rb': 'ruby',
+            '.php': 'php'
+        }
+        if extensions:
+            most_common_ext = max(extensions.items(), key=lambda x: x[1])[0]
+            return lang_map.get(most_common_ext)
+        return None
+    def _is_important_file(self, filename: str) -> bool:
+        """Check if file is important for training"""
+        important_patterns = [
+            'package.json', 'requirements.txt', 'go.mod', 'pom.xml',
+            'dockerfile', 'docker-compose.yml', 'readme.md',
+            'app.py', 'main.py', 'server.js', 'app.js', 'index.js',
+            'settings.py', 'config.py', 'routes.py', 'models.py',
+            'controller.js', 'service.js', 'middleware.js'
+        ]
+        filename_lower = filename.lower()
+        return any(pattern in filename_lower for pattern in important_patterns)
+    def _extract_requirements(self, code_files: Dict[str, str]) -> List[str]:
+        """Extract functional requirements from code"""
+        requirements: List[str] = []
+        if 'package.json' in code_files:
+            try:
+                pkg_data = json.loads(code_files['package.json'])
+                deps = list(pkg_data.get('dependencies', {}).keys())
+                requirements.extend([f"Uses {dep}" for dep in deps[:5]])
+            except Exception:
+                pass
+        if 'requirements.txt' in code_files:
+            lines = code_files['requirements.txt'].strip().split('\n')
+            deps = [line.split('==')[0].split('>=')[0].strip() for line in lines if line.strip()]
+            requirements.extend([f"Uses {dep}" for dep in deps[:5]])
+        for filename, content in code_files.items():
+            if filename.endswith(('.js', '.py')):
+                endpoints = self._extract_endpoints(content)
+                requirements.extend(endpoints)
+        return requirements[:10]
+    def _extract_endpoints(self, code_content: str) -> List[str]:
+        """Extract API endpoints from code"""
+        endpoints: List[str] = []
+        lines = code_content.split('\n')
+        for line in lines:
+            s = line.strip()
+            if any(method in s for method in ['app.get(', 'app.post(', 'app.put(', 'app.delete(']):
+                endpoints.append(f"Implements {s}")
+            elif any(decorator in s for decorator in ['@app.get(', '@app.post(', '@app.put(', '@app.delete(']):
+                endpoints.append(f"Implements {s}")
+            elif 'def ' in s and any(word in s for word in ['get', 'post', 'put', 'delete']):
+                endpoints.append(f"Implements {s}")
+        return endpoints[:5]
+    def _analyze_structure(self, contents: List[Dict]) -> Dict[str, Any]:
+        """Analyze project structure"""
+        structure: Dict[str, Any] = {
+            'files': [],
+            'directories': [],
+            'total_files': 0,
+            'has_tests': False,
+            'has_docs': False
+        }
+        for item in contents:
+            if item.get('type') == 'file':
+                name = item.get('name', '')
+                structure['files'].append(name)
+                structure['total_files'] += 1
+                if 'test' in name.lower():
+                    structure['has_tests'] = True
+                if name.lower() in ['readme.md', 'docs.md']:
+                    structure['has_docs'] = True
+            elif item.get('type') == 'dir':
+                structure['directories'].append(item.get('name', ''))
+        return structure
+    def generate_synthetic_examples(self, count: int = 100):
+        """Generate synthetic training examples"""
+        logger.info(f"Generating {count} synthetic examples...")
+        templates = [
+            {
+                'description': 'REST API for user management',
+                'requirements': ['User registration', 'User authentication', 'Profile management'],
+                'frameworks': ['express', 'fastapi', 'django']
+            },
+            {
+                'description': 'E-commerce backend API',
+                'requirements': ['Product catalog', 'Shopping cart', 'Order processing', 'Payment integration'],
+                'frameworks': ['nestjs', 'fastapi', 'django']
+            },
+            {
+                'description': 'Task management system',
+                'requirements': ['Task CRUD operations', 'User assignments', 'Status tracking'],
+                'frameworks': ['express', 'flask', 'gin']
+            },
+            {
+                'description': 'Blog platform backend',
+                'requirements': ['Article management', 'User comments', 'Category system'],
+                'frameworks': ['express', 'django', 'fastapi']
+            }
+        ]
+        for _ in range(count):
+            template = random.choice(templates)
+            framework = random.choice(template['frameworks'])
+            code_files = self._generate_code_for_template(template, framework)
+            example = CodeExample(
+                description=template['description'],
+                requirements=template['requirements'],
+                framework=framework,
+                language='python' if framework in ['fastapi', 'django', 'flask'] else 'javascript',
+                code_files=code_files,
+                project_structure=self._generate_synthetic_structure(framework),
+                metadata={'synthetic': True}
+            )
+            self.collected_examples.append(example)
+    def _generate_code_for_template(self, template: Dict, framework: str) -> Dict[str, str]:
+        """Generate code files for a template and framework"""
+        if framework == 'express':
+            return {
+                'package.json': json.dumps({
+                    "name": template['description'].lower().replace(' ', '-'),
+                    "version": "1.0.0",
+                    "dependencies": {
+                        "express": "^4.18.0",
+                        "mongoose": "^6.0.0",
+                        "bcrypt": "^5.0.0",
+                        "jsonwebtoken": "^8.5.0"
+                    }
+                }, indent=2),
+                'app.js': '''const express = require('express');
+const mongoose = require('mongoose');
+const app = express();
+// Middleware
+app.use(express.json());
+// Routes
+app.get('/health', (req, res) => {
+    res.json({ status: 'OK' });
+});
+// Start server
+const PORT = process.env.PORT || 3000;
+app.listen(PORT, () => {
+    console.log(`Server running on port ${PORT}`);
+});
+module.exports = app;'''
+            }
+        elif framework == 'fastapi':
+            return {
+                'requirements.txt': '''fastapi==0.68.0
+uvicorn==0.15.0
+sqlalchemy==1.4.23
+pydantic==1.8.2''',
+                'main.py': '''from fastapi import FastAPI, HTTPException
+from pydantic import BaseModel
+from typing import List, Optional
+app = FastAPI()
+class Item(BaseModel):
+    id: Optional[int] = None
+    name: str
+    description: str
+@app.get("/")
+async def root():
+    return {"message": "Hello World"}
+@app.get("/health")
+async def health_check():
+    return {"status": "OK"}
+if __name__ == "__main__":
+    import uvicorn
+    uvicorn.run(app, host="0.0.0.0", port=8000)'''
+            }
+        else:
+            return {'placeholder.txt': 'Generated code placeholder'}
+    def _generate_synthetic_structure(self, framework: str) -> Dict[str, Any]:
+        """Generate project structure for framework"""
+        if framework in ['express', 'nestjs']:
+            return {
+                'files': ['package.json', 'app.js', 'README.md'],
+                'directories': ['routes', 'controllers', 'middleware', 'models'],
+                'total_files': 3,
+                'has_tests': True,
+                'has_docs': True
+            }
+        elif framework in ['fastapi', 'django', 'flask']:
+            return {
+                'files': ['requirements.txt', 'main.py', 'README.md'],
+                'directories': ['models', 'routes', 'services'],
+                'total_files': 3,
+                'has_tests': True,
+                'has_docs': True
+            }
+        else:
+            return {}
+    def save_dataset(self, filepath: str):
+        """Save collected examples to file"""
+        data = [asdict(example) for example in self.collected_examples]
+        with open(filepath, 'w', encoding='utf-8') as f:
+            json.dump(data, f, indent=2, ensure_ascii=False)
+        logger.info(f"Saved {len(data)} examples to {filepath}")
+class DataPreprocessor:
+    """Preprocesses collected data for training"""
+    def __init__(self, tokenizer_name: str = "microsoft/DialoGPT-medium"):
+        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
+        if self.tokenizer.pad_token is None:
+            self.tokenizer.pad_token = self.tokenizer.eos_token
+        # Ensure we do not exceed model's maximum positional embeddings (GPT-2/DialoGPT: 1024)
+        try:
+            model_max = getattr(self.tokenizer, 'model_max_length', 1024)
+            # Some tokenizers set a very large sentinel value; cap at 1024 for GPT-2 family
+            if model_max and model_max > 0 and model_max < 100000:
+                self.max_length = min(1024, int(model_max))
+            else:
+                self.max_length = 1024
+        except Exception:
+            self.max_length = 1024
+    def preprocess_examples(self, examples: List[CodeExample]) -> List[Dict[str, str]]:
+        """Convert examples to training format"""
+        processed: List[Dict[str, str]] = []
+        for example in examples:
+            input_text = self._create_input_text(example)
+            output_text = self._create_output_text(example)
+            processed.append({
+                'input': input_text,
+                'output': output_text,
+                'framework': example.framework,
+                'language': example.language
+            })
+        return processed
+    def _create_input_text(self, example: CodeExample) -> str:
+        """Create model input text"""
+        input_parts: List[str] = [
+            f"Description: {example.description}",
+            f"Framework: {example.framework}",
+            f"Language: {example.language}",
+            "Requirements:",
+        ]
+        for req in example.requirements:
+            input_parts.append(f"- {req}")
+        input_parts.append("Generate the backend application:")
+        return "\n".join(input_parts)
+    def _create_output_text(self, example: CodeExample) -> str:
+        """Create model output text"""
+        output_parts: List[str] = []
+        output_parts.append("Project Structure:")
+        for directory in example.project_structure.get('directories', []):
+            output_parts.append(f"/{directory}/")
+        output_parts.append("\nGenerated Files:")
+        for filename, content in example.code_files.items():
+            output_parts.append(f"\n--- {filename} ---")
+            output_parts.append(content)
+            output_parts.append("--- End ---")
+        return "\n".join(output_parts)
+    def create_training_dataset(self, processed_examples: List[Dict[str, str]]) -> HFDataset:
+        """Create Hugging Face dataset for training"""
+        def tokenize_function(examples: Dict[str, List[str]]):
+            texts: List[str] = []
+            for inp, out in zip(examples['input'], examples['output']):
+                text = f"<|startoftext|>{inp}<|separator|>{out}<|endoftext|>"
+                texts.append(text)
+            return self.tokenizer(
+                texts,
+                truncation=True,
+                padding=True,
+                max_length=self.max_length
+            )
+        dataset_dict = {
+            'input': [ex['input'] for ex in processed_examples],
+            'output': [ex['output'] for ex in processed_examples],
+            'framework': [ex['framework'] for ex in processed_examples],
+            'language': [ex['language'] for ex in processed_examples]
+        }
+        dataset = HFDataset.from_dict(dataset_dict)
+        tokenized_dataset = dataset.map(tokenize_function, batched=True)
+        return tokenized_dataset
+class CodeGenerationModel:
+    """Custom model for backend code generation"""
+    def __init__(self, base_model: str = "microsoft/DialoGPT-medium"):
+        self.base_model = base_model
+        self.tokenizer = AutoTokenizer.from_pretrained(base_model)
+        self.model = AutoModelForCausalLM.from_pretrained(base_model)
+        if self.tokenizer.pad_token is None:
+            self.tokenizer.pad_token = self.tokenizer.eos_token
+    def fine_tune(self, dataset: HFDataset, output_dir: str = "./trained_model"):
+        """Fine-tune the model on backend code generation"""
+        logger.info("Starting model fine-tuning...")
+        training_args = TrainingArguments(
+            output_dir=output_dir,
+            overwrite_output_dir=True,
+            num_train_epochs=1,  # Reduced from 3
+            per_device_train_batch_size=1,  # Reduced from 2 for memory
+            per_device_eval_batch_size=1,   # Reduced from 2
+            warmup_steps=50,     # Reduced from 500
+            max_steps=100,       # Drastically reduced from 2000
+            logging_steps=10,    # More frequent logging
+            save_steps=50,       # More frequent saves
+            save_total_limit=2,
+            prediction_loss_only=True,
+            fp16=torch.cuda.is_available(),
+            dataloader_pin_memory=False,
+            gradient_accumulation_steps=4,  # Accumulate gradients for effective larger batch
+            learning_rate=5e-5,  # Explicit learning rate
+        )
+        data_collator = DataCollatorForLanguageModeling(
+            tokenizer=self.tokenizer,
+            mlm=False,
+        )
+        train_size = int(0.8 * len(dataset))
+        eval_size = len(dataset) - train_size
+        train_dataset, eval_dataset = torch.utils.data.random_split(
+            dataset, [train_size, eval_size]
+        )
+        trainer = Trainer(
+            model=self.model,
+            args=training_args,
+            data_collator=data_collator,
+            train_dataset=train_dataset,
+            eval_dataset=eval_dataset,
+        )
+        trainer.train()
+        trainer.save_model()
+        logger.info("Fine-tuning completed!")
+    def generate_code(self, description: str, framework: str, language: str) -> str:
+        """Generate backend code for given requirements"""
+        input_text = (
+            f"Description: {description}\n"
+            f"Framework: {framework}\n"
+            f"Language: {language}\n"
+            f"Generate the backend application:"
+        )
+        # Respect model's max position embeddings (GPT-2/DialoGPT is typically 1024)
+        model_max_len = getattr(self.tokenizer, 'model_max_length', 1024)
+        max_len = 1024 if model_max_len is None or model_max_len > 100000 else min(1024, int(model_max_len))
+        inputs = self.tokenizer.encode(input_text, return_tensors='pt', truncation=True, max_length=max_len)
+        with torch.no_grad():
+            outputs = self.model.generate(
+                inputs,
+                max_length=max_len,
+                num_return_sequences=1,
+                temperature=0.7,
+                do_sample=True,
+                pad_token_id=self.tokenizer.eos_token_id
+            )
+        generated_text = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
+        return generated_text[len(input_text):]
+class ModelEvaluator:
+    """Evaluates model performance"""
+    def __init__(self):
+        self.metrics: Dict[str, float] = {}
+    def evaluate_code_quality(self, generated_code: str, language: str) -> Dict[str, float]:
+        """Evaluate generated code quality"""
+        metrics = {
+            'syntax_correctness': self._check_syntax(generated_code, language),
+            'completeness': self._check_completeness(generated_code),
+            'best_practices': self._check_best_practices(generated_code, language)
+        }
+        return metrics
+    def _check_syntax(self, code: str, language: str) -> float:
+        """Check if generated code has valid syntax"""
+        if language == 'python':
+            try:
+                ast.parse(code)
+                return 1.0
+            except SyntaxError:
+                return 0.0
+        elif language == 'javascript':
+            if '{' in code and '}' in code:
+                return 0.8
+            return 0.5
+        return 0.5
+    def _check_completeness(self, code: str) -> float:
+        """Check if code appears complete"""
+        completeness_indicators = [
+            'import', 'require', 'function', 'def', 'class',
+            'app.', 'router.', '@app.', 'app.listen', 'if __name__'
+        ]
+        indicators_found = sum(1 for indicator in completeness_indicators if indicator in code)
+        return min(indicators_found / 3.0, 1.0)
+    def _check_best_practices(self, code: str, language: str) -> float:
+        """Check adherence to best practices"""
+        best_practices_score = 0.0
+        if 'try:' in code or 'catch' in code:
+            best_practices_score += 0.2
+        if any(comment in code for comment in ['#', '//', '/*']):
+            best_practices_score += 0.2
+        if language == 'python':
+            if 'if __name__ == "__main__"' in code:
+                best_practices_score += 0.2
+        elif language == 'javascript':
+            if 'const' in code or 'let' in code:
+                best_practices_score += 0.2
+        return min(best_practices_score, 1.0)
+    def benchmark_model(self, model: 'CodeGenerationModel', test_cases: List[Dict]) -> Dict[str, float]:
+        """Benchmark model on test cases"""
+        total_scores = {'syntax': 0.0, 'completeness': 0.0, 'best_practices': 0.0}
+        for i, test_case in enumerate(test_cases):
+            generated_code = model.generate_code(
+                test_case['description'],
+                test_case['framework'],
+                test_case['language']
+            )
+            scores = self.evaluate_code_quality(generated_code, test_case['language'])
+            total_scores['syntax'] += scores['syntax_correctness']
+            total_scores['completeness'] += scores['completeness']
+            total_scores['best_practices'] += scores['best_practices']
+            logger.info(f"Test case {i+1}: {scores}")
+        num_cases = max(1, len(test_cases))
+        avg_scores = {key: value / num_cases for key, value in total_scores.items()}
+        return avg_scores
+class TrainingPipeline:
+    """Main training pipeline orchestrator"""
+    def __init__(self, config: Dict[str, Any]):
+        self.config = config
+        self.data_collector = DataCollector()
+        self.preprocessor = DataPreprocessor(config.get('tokenizer', 'microsoft/DialoGPT-medium'))
+        self.model = CodeGenerationModel(config.get('base_model', 'microsoft/DialoGPT-medium'))
+        self.evaluator = ModelEvaluator()
+    async def run_full_pipeline(self):
+        """Run the complete training pipeline"""
+        logger.info("Starting full training pipeline...")
+        logger.info("Step 1: Collecting training data...")
+        if self.data_collector.github_token:
+            github_queries = [
+                'express api backend',
+                'fastapi python backend',
+                'django rest api',
+                'nodejs backend server',
+                'flask api backend'
+            ]
+            await self.data_collector.collect_github_repositories(github_queries, max_repos=50)
+        self.data_collector.generate_synthetic_examples(count=200)
+        self.data_collector.save_dataset('raw_dataset.json')
+        logger.info("Step 2: Preprocessing data...")
+        processed_examples = self.preprocessor.preprocess_examples(self.data_collector.collected_examples)
+        training_dataset = self.preprocessor.create_training_dataset(processed_examples)
+        logger.info("Step 3: Training model...")
+        self.model.fine_tune(training_dataset, output_dir=self.config.get('output_dir', './trained_model'))
+        logger.info("Step 4: Evaluating model...")
+        test_cases = [
+            {
+                'description': 'REST API for user management with authentication',
+                'framework': 'express',
+                'language': 'javascript'
+            },
+            {
+                'description': 'FastAPI backend for e-commerce platform',
+                'framework': 'fastapi',
+                'language': 'python'
+            },
+            {
+                'description': 'Django REST API for blog platform',
+                'framework': 'django',
+                'language': 'python'
+            }
+        ]
+        benchmark_results = self.evaluator.benchmark_model(self.model, test_cases)
+        logger.info(f"Benchmark results: {benchmark_results}")
+        logger.info("Training pipeline completed!")
+        return benchmark_results
+# Script entry point: run the pipeline with the default configuration, then print one sample generation.
+if __name__ == "__main__":
+    config = {
+        'base_model': 'microsoft/DialoGPT-medium',
+        'tokenizer': 'microsoft/DialoGPT-medium',
+        'output_dir': './backend_code_model',
+        'github_token': os.getenv('GITHUB_TOKEN'),
+    }
+    pipeline = TrainingPipeline(config)
+    # run_full_pipeline is a coroutine, so it is driven with asyncio.run
+    asyncio.run(pipeline.run_full_pipeline())
+    logger.info("\nTesting trained model...")
+    # Smoke-test the fine-tuned model with a single prompt
+    generated_code = pipeline.model.generate_code(
+        description="Create a REST API for managing tasks with CRUD operations",
+        framework="express",
+        language="javascript"
+    )
+    print("\nGenerated Code:")
+    print("=" * 50)
+    print(generated_code)