RAG 시스템 실전 구축 (v3)
개요
本指南详述构建实战 RAG(Retrieval-Augmented Generation,检索增强生成)系统之所需。为使 ML 工程师与后端开发者能依据数据回答有意义之问,本文提供实用之代码与方略。
1. RAG 基础之理
RAG 乃检索增强生成之系统,经三要步:
检索 (Retrieval)
于文档中检索与问题相关之讯息。
增强 (Augmentation)
所搜信息,增补于提示,使大语言模型用之作答。
生成 (Generation)
大语言模型以所搜信息及原本之问,作答。
질문 → 검색 → 보강 → 생성 → 답변
2. 分段策略
2.1 意义分段 (Semantic Chunking)
from sentence_transformers import SentenceTransformer
import numpy as np
from sklearn.cluster import KMeans
def semantic_chunking(text, model, threshold=0.7):
    """Group the sentences of ``text`` into chunks by clustering their embeddings.

    The text is split on ``'. '``, every non-blank sentence is embedded with
    ``model.encode`` and the embeddings are clustered with K-Means (at most
    10 clusters). Sentences assigned to the same cluster form one chunk.

    Args:
        text: Input text to chunk.
        model: Object exposing ``encode(list_of_str)`` that returns one
            embedding per sentence (e.g. a ``SentenceTransformer``).
        threshold: Currently unused; kept so existing callers keep working.

    Returns:
        A list of chunks, each chunk being a list of sentences. Chunks are
        ordered by the first sentence they contain and sentences keep their
        original order inside a chunk. Empty or whitespace-only text yields
        an empty list without calling ``model``.
    """
    # Drop blank fragments so they are neither embedded nor clustered.
    sentences = [s for s in text.split('. ') if s.strip()]
    if not sentences:
        return []

    sentence_embeddings = model.encode(sentences)

    # A fixed seed makes the chunking reproducible: re-indexing the same
    # document produces the same chunks.
    kmeans = KMeans(n_clusters=min(len(sentences), 10), random_state=0)
    kmeans.fit(sentence_embeddings)

    # dicts preserve insertion order, so clusters come out in order of first
    # appearance, and each sentence is placed with a single O(1) lookup.
    grouped = {}
    for label, sentence in zip(kmeans.labels_, sentences):
        grouped.setdefault(int(label), []).append(sentence)
    return list(grouped.values())
# Usage example: load the all-MiniLM-L6-v2 embedding model and chunk a short
# three-sentence Korean sample text with semantic_chunking.
model = SentenceTransformer('all-MiniLM-L6-v2')
text = "AI 기술은 빠르게 발전하고 있습니다. 인공지능은 많은 분야에서 활용됩니다. 특히 자연어 처리 분야에서는 큰 진전이 있습니다."
chunks = semantic_chunking(text, model)
2.2 递归分段 (Recursive Chunking)
import re
def _split_oversized(text, chunk_size, separators=("\n", ". ", " ")):
    """Break ``text`` into stripped pieces of at most ``chunk_size`` characters.

    Tries each separator in turn (line, sentence, word) and recurses with the
    finer ones for any part that is still too long; when no separator is
    left it falls back to fixed-width slices.
    """
    text = text.strip()
    if not text:
        return []
    if len(text) <= chunk_size:
        return [text]
    if not separators:
        # No natural boundary left: cut at fixed width.
        slices = (
            text[start:start + chunk_size].strip()
            for start in range(0, len(text), chunk_size)
        )
        return [piece for piece in slices if piece]

    sep, finer = separators[0], separators[1:]
    parts = text.split(sep)
    # Keep the separator attached to the part it follows so sentence-ending
    # punctuation is not lost at piece boundaries.
    parts = [part + sep for part in parts[:-1]] + parts[-1:]

    pieces = []
    current = ""
    for part in parts:
        if len((current + part).rstrip()) <= chunk_size:
            current += part
            continue
        if current.strip():
            pieces.append(current.strip())
        if len(part.rstrip()) <= chunk_size:
            current = part
        else:
            pieces.extend(_split_oversized(part, chunk_size, finer))
            current = ""
    if current.strip():
        pieces.append(current.strip())
    return pieces


def recursive_chunking(text, chunk_size=512):
    """Split ``text`` into chunks of roughly ``chunk_size`` characters.

    Paragraphs (separated by a blank line) are packed greedily into chunks.
    A paragraph that is longer than ``chunk_size`` on its own is split
    recursively on finer boundaries (lines, sentences, words, then fixed
    width) so that it no longer produces an oversized chunk.

    Args:
        text: Input text.
        chunk_size: Target maximum chunk length in characters; must be >= 1.

    Returns:
        A list of non-empty, stripped chunk strings in document order.
        Empty or whitespace-only input yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    chunks = []
    current_chunk = ""
    for para in text.split('\n\n'):
        # Blank paragraphs (e.g. from runs of empty lines) carry no content.
        if not para.strip():
            continue

        if len(para) > chunk_size:
            # Oversized paragraph: emit what was collected so far, then the
            # paragraph's own sub-chunks, keeping document order.
            if current_chunk.strip():
                chunks.append(current_chunk.strip())
            current_chunk = ""
            chunks.extend(_split_oversized(para, chunk_size))
            continue

        if len(current_chunk) + len(para) < chunk_size:
            current_chunk += para + "\n\n"
        else:
            if current_chunk.strip():
                chunks.append(current_chunk.strip())
            current_chunk = para + "\n\n"

    if current_chunk.strip():
        chunks.append(current_chunk.strip())
    return chunks
2.3 基于代理的分段 (Agentic Chunking)
class AgenticChunker:
    """Sentence-aware chunker that packs whole sentences into bounded chunks."""

    def __init__(self, model_name="gpt-3.5-turbo"):
        """Remember the model name.

        Args:
            model_name: Stored on ``self.model``; not used by
                ``chunk_with_context`` itself.
        """
        self.model = model_name

    def chunk_with_context(self, text, context_length=1000):
        """Split ``text`` into chunks made of complete sentences.

        Sentences are detected at whitespace that follows ``.``, ``!`` or
        ``?`` and keep their original terminating punctuation. They are
        joined with a single space until adding the next sentence would make
        the chunk reach ``context_length`` characters.

        Args:
            text: Input text.
            context_length: Exclusive upper bound on chunk length. A single
                sentence longer than this still becomes its own chunk.

        Returns:
            A list of non-empty chunk strings in document order; an empty
            list for empty or whitespace-only text.
        """
        # Lookbehind keeps the punctuation with its sentence; requiring
        # whitespace after it avoids splitting inside tokens such as "3.14".
        sentences = [
            fragment.strip()
            for fragment in re.split(r'(?<=[.!?])\s+', text)
            if fragment.strip()
        ]

        chunks = []
        current = []      # sentences of the chunk being built
        current_len = 0   # length of " ".join(current)
        for sentence in sentences:
            if current and current_len + 1 + len(sentence) >= context_length:
                chunks.append(" ".join(current))
                current, current_len = [], 0
            # +1 accounts for the joining space when the chunk is non-empty.
            current_len += len(sentence) + (1 if current else 0)
            current.append(sentence)

        if current:
            chunks.append(" ".join(current))
        return chunks
3. 嵌入模型之选
3.1 对比之表
| 模型 | 性能 | 速率 | 内存 | 用途 |
|---|---|---|---|---|
| all-MiniLM-L6-v2 | 86.4 | 迅捷 | 100MB | 常规 |
| all-mpnet-base-v2 | 88.2 | 中等 | 400MB | 精当 |
| BGE-M3 | 90.1 | 中等 | 500MB | 多语 |
| Sentence-BERT | 89.8 | 缓慢 | 700MB | 高精度 |
3.2 성능 비교 코드
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import time
def evaluate_embedding_models():
    """Benchmark three sentence-embedding models on a tiny fixed input.

    For each model the function encodes three English sentences and records
    the wall-clock encoding time, the cosine similarity between the first
    two sentences, and the size of the model's parameters.

    Returns:
        A dict mapping model name to ``{'time': seconds, 'similarity':
        cosine similarity, 'memory': parameter size in MB}``.
    """
    models = {
        'all-MiniLM-L6-v2': SentenceTransformer('all-MiniLM-L6-v2'),
        'all-mpnet-base-v2': SentenceTransformer('all-mpnet-base-v2'),
        'BGE-M3': SentenceTransformer('BAAI/bge-m3'),
    }
    test_sentences = [
        "Machine learning is a subset of artificial intelligence.",
        "Deep learning uses neural networks with multiple layers.",
        "Natural language processing deals with human language understanding.",
    ]

    results = {}
    for name, model in models.items():
        # perf_counter is monotonic and high-resolution, unlike time.time().
        # NOTE(review): this is a single un-warmed call, so the figure
        # presumably includes first-call overhead — confirm before quoting.
        start_time = time.perf_counter()
        embeddings = model.encode(test_sentences)
        elapsed = time.perf_counter() - start_time

        # Similarity between the first two test sentences.
        similarity = cosine_similarity([embeddings[0]], [embeddings[1]])[0][0]

        # Sum over every parameter tensor using its real element size,
        # instead of looking up a single 'weight' entry and assuming 4 bytes.
        param_bytes = sum(p.numel() * p.element_size() for p in model.parameters())

        results[name] = {
            'time': elapsed,
            'similarity': similarity,
            'memory': param_bytes / (1024 ** 2),  # MB
        }
    return results
# Run the benchmark and print one summary line per model.
perf_results = evaluate_embedding_models()
# The loop variable is `model_name` rather than `model` so that it does not
# overwrite the module-level `model` (the SentenceTransformer instance)
# created earlier in this file.
for model_name, metrics in perf_results.items():
    print(f"{model_name}: Time={metrics['time']:.3f}s, Similarity={metrics['similarity']:.3f}")
4. 벡터 데이터베이스 비교
4.1 Chroma (가볍고 빠름)
import chromadb
from chromadb import Client
class ChromaVectorStore:
    """Small wrapper around a single Chroma collection."""

    def __init__(self, collection_name="documents"):
        """Create a Chroma client and open (or create) ``collection_name``."""
        self.client = Client()
        self.collection = self.client.get_or_create_collection(collection_name)

    @staticmethod
    def _to_list(vectors):
        """Return ``vectors`` as plain Python lists when it is array-like."""
        return vectors.tolist() if hasattr(vectors, "tolist") else vectors

    def add_documents(self, documents, embeddings, metadatas=None, ids=None):
        """Add documents with their embeddings to the collection.

        Args:
            documents: Sequence of document texts.
            embeddings: One embedding per document (lists or a NumPy array).
            metadatas: Optional sequence of metadata dicts, one per document.
            ids: Optional sequence of unique string ids. When omitted, a
                random UUID is generated for every document.

        Returns:
            The list of ids the documents were stored under.
        """
        import uuid

        documents = list(documents)
        if ids is None:
            # Chroma requires an id per record; UUIDs avoid collisions
            # between successive calls.
            ids = [str(uuid.uuid4()) for _ in documents]
        else:
            ids = list(ids)

        self.collection.add(
            ids=ids,
            documents=documents,
            embeddings=self._to_list(embeddings),
            metadatas=metadatas,
        )
        return ids

    def search(self, query_embedding, n_results=5):
        """Return the ``n_results`` nearest records for ``query_embedding``.

        The raw result of ``collection.query`` is returned unchanged.
        """
        return self.collection.query(
            query_embeddings=[self._to_list(query_embedding)],
            n_results=n_results,
        )
# Usage example: create a store on the default "documents" collection.
chroma_store = ChromaVectorStore()
# Then embed the documents and add them, e.g.:
# embeddings = model.encode(documents)
# chroma_store.add_documents(documents, embeddings)
4.2 Qdrant (고성능)
from qdrant_client import QdrantClient
from qdrant_client.models import Filter, FieldCondition, MatchValue
class QdrantVectorStore:
    """Small wrapper around a single Qdrant collection using cosine distance."""

    def __init__(self, host="localhost", port=6333, collection_name="documents",
                 vector_size=384, recreate=False):
        """Connect to Qdrant and make sure the collection exists.

        Args:
            host: Qdrant host name.
            port: Qdrant port.
            collection_name: Name of the collection to use.
            vector_size: Dimensionality of the stored vectors; must match the
                embedding model (384 matches all-MiniLM-L6-v2 used elsewhere
                in this guide).
            recreate: When True, drop an existing collection first. The
                default keeps existing data, so constructing the store no
                longer wipes the collection.
        """
        from qdrant_client.models import Distance, VectorParams

        self.client = QdrantClient(host=host, port=port)
        self.collection_name = collection_name

        exists = self.client.collection_exists(self.collection_name)
        if exists and recreate:
            self.client.delete_collection(self.collection_name)
            exists = False
        if not exists:
            self.client.create_collection(
                collection_name=self.collection_name,
                vectors_config=VectorParams(size=vector_size, distance=Distance.COSINE),
            )

    def add_documents(self, documents, embeddings, metadatas=None, ids=None):
        """Upsert documents with their embeddings and optional metadata.

        Args:
            documents: Sequence of document texts.
            embeddings: One embedding per document (lists or NumPy arrays).
            metadatas: Optional sequence of metadata dicts, one per document.
            ids: Optional sequence of point ids. When omitted, a random UUID
                is generated per document so that repeated calls do not
                overwrite points written by earlier calls.

        Returns:
            The list of point ids that were written.
        """
        import uuid

        from qdrant_client.models import PointStruct

        documents = list(documents)
        if metadatas is None:
            metadatas = [None] * len(documents)
        if ids is None:
            ids = [str(uuid.uuid4()) for _ in documents]
        else:
            ids = list(ids)

        points = [
            PointStruct(
                id=point_id,
                # Plain lists keep the payload serialisable when the
                # embedding is a NumPy array.
                vector=embedding.tolist() if hasattr(embedding, "tolist") else embedding,
                payload={"text": doc, "metadata": metadata if metadata else {}},
            )
            for point_id, doc, embedding, metadata in zip(ids, documents, embeddings, metadatas)
        ]
        self.client.upsert(
            collection_name=self.collection_name,
            points=points,
        )
        return ids

    def search(self, query_embedding, n_results=5):
        """Return up to ``n_results`` nearest points for ``query_embedding``.

        The raw result of ``client.search`` is returned unchanged.
        """
        if hasattr(query_embedding, "tolist"):
            query_embedding = query_embedding.tolist()
        return self.client.search(
            collection_name=self.collection_name,
            query_vector=query_embedding,
            limit=n_results,
        )
# Usage example: create a store with the default host ("localhost"),
# port (6333) and collection name ("documents").
qdrant_store = QdrantVectorStore()
# Then add already-embedded documents, e.g.:
# qdrant_store.add_documents(documents, embeddings)
4.3 pgvector(基于SQL)
python
import psycopg2
from psycopg2.extras import Json
import numpy as np
class PGVectorStore:
# Constructor: opens the database connection and then creates the table.
def __init__(self, connection_string):
# connection_string is passed to psycopg2.connect unchanged.
self.conn = psycopg2.connect(connection_string)
# Run the CREATE TABLE / CREATE INDEX statements right after connecting.
self.create_table()
def create_table(self):
    """Create the pgvector extension, the embeddings table and its index.

    Every statement uses ``IF NOT EXISTS``, so calling this repeatedly is
    harmless. The statements run in one transaction that is committed on
    success and rolled back if any statement fails.

    Raises:
        Exception: Whatever the database driver raises; the transaction is
            rolled back before the error propagates.
    """
    try:
        with self.conn.cursor() as cur:
            # The VECTOR column type only exists once the pgvector extension
            # is enabled in this database.
            # NOTE(review): enabling it presumably needs elevated privileges
            # when it is not installed yet — confirm for the target server.
            cur.execute("CREATE EXTENSION IF NOT EXISTS vector")
            cur.execute("""
                CREATE TABLE IF NOT EXISTS document_embeddings (
                    id SERIAL PRIMARY KEY,
                    document TEXT,
                    embedding VECTOR(384),
                    metadata JSONB,
                    created_at TIMESTAMP DEFAULT NOW()
                )
            """)
            cur.execute("CREATE INDEX IF NOT EXISTS idx_embedding ON document_embeddings USING ivfflat (embedding vector_cosine_ops)")
        self.conn.commit()
    except Exception:
        # Leave the connection usable instead of stuck in a failed transaction.
        self.conn.rollback()
        raise
def add_documents(self,
---
📥 **Get the full guide on Gumroad**: https://gumroad.com/l/auto ($7)












