Upload rml_ai/memory.py with huggingface_hub
Browse files- rml_ai/memory.py +178 -0
rml_ai/memory.py
ADDED
@@ -0,0 +1,178 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Memory Store for RML System
|
3 |
+
Handles vector storage and semantic search
|
4 |
+
"""
|
5 |
+
|
6 |
+
import numpy as np
|
7 |
+
from typing import List, Dict, Any, Optional, Callable
|
8 |
+
from sklearn.metrics.pairwise import cosine_similarity
|
9 |
+
|
10 |
+
|
11 |
+
class MemoryStore:
    """Vector-based memory store for semantic search.

    Holds a list of entry dicts and a matrix of their embeddings (row i
    corresponds to entry i). `search` performs cosine-similarity retrieval
    and degrades to keyword matching when no query encoder is configured.
    """

    def __init__(self):
        # Entry dicts; RML records may carry 'text', 'summaries',
        # 'concepts', 'tags', 'source', ... keys.
        self.entries: List[Dict[str, Any]] = []
        # 2-D array aligned row-for-row with self.entries; None until
        # add_entries() has been called.
        self.embeddings: Optional[np.ndarray] = None
        # Caller-injected function mapping a query string to an embedding
        # vector; when None, search() falls back to keyword matching.
        self.encode_query_fn: Optional[Callable] = None
|
18 |
+
|
19 |
+
def add_entries(self, entries: List[Dict[str, Any]], embeddings: np.ndarray):
|
20 |
+
"""Add entries with their embeddings"""
|
21 |
+
self.entries = entries
|
22 |
+
self.embeddings = embeddings
|
23 |
+
|
24 |
+
def search(self, query: str, top_k: int = 5) -> List[Dict[str, Any]]:
|
25 |
+
"""Search for relevant entries using semantic similarity"""
|
26 |
+
if not self.entries or self.embeddings is None:
|
27 |
+
return []
|
28 |
+
|
29 |
+
if not self.encode_query_fn:
|
30 |
+
# Fallback to keyword search
|
31 |
+
return self._keyword_search(query, top_k)
|
32 |
+
|
33 |
+
try:
|
34 |
+
# Encode query
|
35 |
+
query_embedding = self.encode_query_fn(query)
|
36 |
+
|
37 |
+
# Handle empty embeddings
|
38 |
+
if self.embeddings is None or len(self.embeddings) == 0:
|
39 |
+
return self._keyword_search(query, top_k)
|
40 |
+
|
41 |
+
# Ensure proper dimensions
|
42 |
+
if len(self.embeddings.shape) == 1:
|
43 |
+
# If embeddings is 1D, reshape to 2D
|
44 |
+
embeddings = self.embeddings.reshape(1, -1)
|
45 |
+
else:
|
46 |
+
embeddings = self.embeddings
|
47 |
+
|
48 |
+
if len(query_embedding.shape) == 1:
|
49 |
+
query_embedding = query_embedding.reshape(1, -1)
|
50 |
+
|
51 |
+
# Check dimension compatibility
|
52 |
+
if query_embedding.shape[1] != embeddings.shape[1]:
|
53 |
+
print(f"Embedding dimension mismatch: query {query_embedding.shape[1]} vs entries {embeddings.shape[1]}")
|
54 |
+
return self._keyword_search(query, top_k)
|
55 |
+
|
56 |
+
# Calculate similarities
|
57 |
+
similarities = cosine_similarity(query_embedding, embeddings)[0]
|
58 |
+
|
59 |
+
# Get top-k results
|
60 |
+
top_indices = np.argsort(similarities)[::-1][:top_k]
|
61 |
+
|
62 |
+
results = []
|
63 |
+
for idx in top_indices:
|
64 |
+
if similarities[idx] > 0.1: # Minimum similarity threshold
|
65 |
+
entry = self.entries[idx].copy()
|
66 |
+
entry['text'] = self._extract_text(entry)
|
67 |
+
entry['similarity'] = float(similarities[idx])
|
68 |
+
entry['source'] = entry.get('source', 'internal dataset')
|
69 |
+
results.append(entry)
|
70 |
+
|
71 |
+
return results
|
72 |
+
|
73 |
+
except Exception as e:
|
74 |
+
print(f"Error during search: {e}")
|
75 |
+
return self._keyword_search(query, top_k)
|
76 |
+
|
77 |
+
def _keyword_search(self, query: str, top_k: int) -> List[Dict[str, Any]]:
|
78 |
+
"""Fallback keyword search with RML-aware scoring"""
|
79 |
+
query_lower = query.lower()
|
80 |
+
query_words = set(query_lower.split())
|
81 |
+
results = []
|
82 |
+
|
83 |
+
for entry in self.entries:
|
84 |
+
score = 0
|
85 |
+
text = self._extract_text(entry).lower()
|
86 |
+
|
87 |
+
# Check direct text matches
|
88 |
+
text_words = set(text.split())
|
89 |
+
common_words = query_words.intersection(text_words)
|
90 |
+
score += len(common_words) * 2 # Base score for word matches
|
91 |
+
|
92 |
+
# Boost score for matches in specific RML fields
|
93 |
+
if 'concepts' in entry and entry['concepts']:
|
94 |
+
concepts_text = " ".join(entry['concepts']).lower() if isinstance(entry['concepts'], list) else str(entry['concepts']).lower()
|
95 |
+
concept_matches = sum(1 for word in query_words if word in concepts_text)
|
96 |
+
score += concept_matches * 3 # Higher weight for concept matches
|
97 |
+
|
98 |
+
if 'tags' in entry and entry['tags']:
|
99 |
+
tags_text = " ".join(entry['tags']).lower() if isinstance(entry['tags'], list) else str(entry['tags']).lower()
|
100 |
+
tag_matches = sum(1 for word in query_words if word in tags_text)
|
101 |
+
score += tag_matches * 2 # Medium weight for tag matches
|
102 |
+
|
103 |
+
if 'summaries' in entry and entry['summaries']:
|
104 |
+
summary_text = entry['summaries'][0].lower() if isinstance(entry['summaries'], list) and entry['summaries'] else str(entry['summaries']).lower()
|
105 |
+
summary_matches = sum(1 for word in query_words if word in summary_text)
|
106 |
+
score += summary_matches * 4 # Highest weight for summary matches
|
107 |
+
|
108 |
+
# Only include results with some relevance
|
109 |
+
if score > 0:
|
110 |
+
entry_copy = entry.copy()
|
111 |
+
entry_copy['text'] = self._extract_text(entry)
|
112 |
+
entry_copy['similarity'] = min(0.9, score / 10) # Normalize score to similarity
|
113 |
+
entry_copy['source'] = entry.get('source', 'internal dataset')
|
114 |
+
results.append(entry_copy)
|
115 |
+
|
116 |
+
# Sort by similarity score and return top-k
|
117 |
+
results.sort(key=lambda x: x['similarity'], reverse=True)
|
118 |
+
return results[:top_k]
|
119 |
+
|
120 |
+
def _extract_text(self, entry: Dict[str, Any]) -> str:
|
121 |
+
"""Extract text content from entry, handling RML-specific structure"""
|
122 |
+
# First try standard fields
|
123 |
+
for field in ['text', 'content', 'body', 'chunk', 'summary', 'title']:
|
124 |
+
if field in entry and entry[field]:
|
125 |
+
return str(entry[field])
|
126 |
+
|
127 |
+
# Handle RML-specific structure
|
128 |
+
text_parts = []
|
129 |
+
|
130 |
+
# Extract from summaries (first priority for RML data)
|
131 |
+
if 'summaries' in entry and entry['summaries']:
|
132 |
+
if isinstance(entry['summaries'], list) and entry['summaries']:
|
133 |
+
text_parts.append(entry['summaries'][0])
|
134 |
+
elif isinstance(entry['summaries'], str):
|
135 |
+
text_parts.append(entry['summaries'])
|
136 |
+
|
137 |
+
# Extract from concepts
|
138 |
+
if 'concepts' in entry and entry['concepts']:
|
139 |
+
if isinstance(entry['concepts'], list):
|
140 |
+
text_parts.append(" ".join(entry['concepts'][:10])) # First 10 concepts
|
141 |
+
elif isinstance(entry['concepts'], str):
|
142 |
+
text_parts.append(entry['concepts'])
|
143 |
+
|
144 |
+
# Extract from tags
|
145 |
+
if 'tags' in entry and entry['tags']:
|
146 |
+
if isinstance(entry['tags'], list):
|
147 |
+
text_parts.append(" ".join(entry['tags'][:10])) # First 10 tags
|
148 |
+
elif isinstance(entry['tags'], str):
|
149 |
+
text_parts.append(entry['tags'])
|
150 |
+
|
151 |
+
# Combine all parts
|
152 |
+
if text_parts:
|
153 |
+
return " ".join(text_parts)
|
154 |
+
|
155 |
+
# Fallback: convert entire entry to string (excluding large arrays)
|
156 |
+
filtered_entry = {}
|
157 |
+
for k, v in entry.items():
|
158 |
+
if k not in ['vectors', 'embeddings'] and v:
|
159 |
+
if isinstance(v, list) and len(v) > 20:
|
160 |
+
filtered_entry[k] = v[:5] # Only first 5 items of large lists
|
161 |
+
else:
|
162 |
+
filtered_entry[k] = v
|
163 |
+
|
164 |
+
return str(filtered_entry) if filtered_entry else "No content available"
|
165 |
+
|
166 |
+
def get_stats(self) -> Dict[str, Any]:
|
167 |
+
"""Get memory store statistics"""
|
168 |
+
embedding_dim = 0
|
169 |
+
if self.embeddings is not None and len(self.embeddings.shape) > 1:
|
170 |
+
embedding_dim = self.embeddings.shape[1]
|
171 |
+
elif self.embeddings is not None and len(self.embeddings.shape) == 1:
|
172 |
+
embedding_dim = len(self.embeddings)
|
173 |
+
|
174 |
+
return {
|
175 |
+
'total_entries': len(self.entries),
|
176 |
+
'embedding_dim': embedding_dim,
|
177 |
+
'has_embeddings': self.embeddings is not None
|
178 |
+
}
|