RAG 시스템 실전 구축 (v3)

개요

이 가이드는 실전에서 RAG(Retrieval-Augmented Generation) 시스템을 구축하는 데 필요한 핵심 내용을 다룹니다. ML 엔지니어와 백엔드 개발자가 자체 데이터를 기반으로 질문에 답변하는 시스템을 구축할 때 바로 활용할 수 있는 실용적인 코드와 접근 방식을 제공합니다.

1. RAG 기본 개념

RAG는 검색 증강 생성(Retrieval-Augmented Generation) 시스템으로, 세 가지 주요 단계를 거칩니다:

검색 (Retrieval)

문서에서 관련 정보를 찾는 과정

보강 (Augmentation)

검색된 정보를 프롬프트에 추가하여 LLM이 답변 생성에 활용

생성 (Generation)

LLM이 검색된 정보와 원본 질문을 기반으로 답변 생성

질문 → 검색 → 보강 → 생성 → 답변

2. 청킹 전략

2.1 의미적 청킹 (Semantic Chunking)

from sentence_transformers import SentenceTransformer
import numpy as np
from sklearn.cluster import KMeans

def semantic_chunking(text, model, threshold=0.7):
    """Group the sentences of ``text`` into chunks by clustering their embeddings.

    The text is split on ``'. '``, every sentence is embedded with
    ``model.encode`` and the embeddings are clustered with ``KMeans`` (at most
    10 clusters).  All sentences assigned to the same cluster form one chunk,
    so a chunk may contain sentences that are not adjacent in ``text``.

    Args:
        text: Input text to split.
        model: Object exposing ``encode(sentences)`` that returns one
            embedding per sentence (e.g. a ``SentenceTransformer``).
        threshold: Currently unused; kept so existing callers keep working.

    Returns:
        A list of chunks, each a list of sentence strings.  Chunks are ordered
        by the first appearance of their cluster and sentences keep their
        original order inside a chunk.  Blank input yields an empty list.
    """
    # Drop blank fragments so empty strings are never embedded or clustered.
    sentences = [sentence for sentence in text.split('. ') if sentence.strip()]
    if not sentences:
        # KMeans cannot be fitted on zero samples, so stop before touching it.
        return []

    sentence_embeddings = model.encode(sentences)

    # Never ask for more clusters than there are sentences.
    kmeans = KMeans(n_clusters=min(len(sentences), 10))
    kmeans.fit(sentence_embeddings)

    # A dict keeps insertion order, so chunks come out in order of first
    # appearance without rescanning the chunk list for every sentence.
    chunks_by_cluster = {}
    for sentence, cluster in zip(sentences, kmeans.labels_.tolist()):
        chunks_by_cluster.setdefault(cluster, []).append(sentence)

    return list(chunks_by_cluster.values())

# Usage example: embed three short Korean sentences and cluster them into chunks.
model = SentenceTransformer('all-MiniLM-L6-v2')
text = "AI 기술은 빠르게 발전하고 있습니다. 인공지능은 많은 분야에서 활용됩니다. 특히 자연어 처리 분야에서는 큰 진전이 있습니다."
chunks = semantic_chunking(text, model)

2.2 재귀적 청킹 (Recursive Chunking)

import re

def recursive_chunking(text, chunk_size=512):
    """Split ``text`` into chunks, recursing to finer separators when needed.

    Paragraphs (separated by a blank line) are packed greedily into chunks
    shorter than ``chunk_size``.  A paragraph that is too long on its own is
    split again on single newlines, then on spaces, and finally cut into
    fixed-width slices, so no returned chunk is longer than ``chunk_size``.

    Args:
        text: Input text to split.
        chunk_size: Maximum chunk length in characters; must be positive.

    Returns:
        A list of non-empty, whitespace-stripped chunks in document order.
        Blank input yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    return _pack_pieces(text, chunk_size, ("\n\n", "\n", " "))


def _emit_chunk(chunks, candidate):
    """Append ``candidate`` to ``chunks`` after stripping, skipping blank ones."""
    candidate = candidate.strip()
    if candidate:
        chunks.append(candidate)


def _pack_pieces(text, chunk_size, separators):
    """Pack pieces of ``text`` split on ``separators[0]``, recursing on oversize pieces."""
    separator, finer_separators = separators[0], separators[1:]
    chunks = []
    pending = []      # pieces collected for the chunk being built
    pending_size = 0  # their length, each counted with one trailing separator

    for piece in text.split(separator):
        if pending_size + len(piece) < chunk_size:
            pending.append(piece)
            pending_size += len(piece) + len(separator)
            continue

        # The piece does not fit: emit what has been collected so far.
        _emit_chunk(chunks, separator.join(pending))
        pending, pending_size = [], 0

        if len(piece) < chunk_size:
            # Fits in a fresh chunk of its own.
            pending.append(piece)
            pending_size = len(piece) + len(separator)
        elif finer_separators:
            # Too long even alone: retry with the next, finer separator.
            chunks.extend(_pack_pieces(piece, chunk_size, finer_separators))
        else:
            # No separator left (e.g. one very long token): cut by width.
            for start in range(0, len(piece), chunk_size):
                _emit_chunk(chunks, piece[start:start + chunk_size])

    _emit_chunk(chunks, separator.join(pending))
    return chunks

2.3 에이전트 기반 청킹 (Agentic Chunking)

class AgenticChunker:
    """Rule-based chunker that packs whole sentences into size-limited chunks.

    ``model_name`` is stored on the instance as ``model`` but is not used by
    ``chunk_with_context``, which relies on punctuation only.
    """

    def __init__(self, model_name="gpt-3.5-turbo"):
        """Remember the model name for callers that want to inspect it."""
        self.model = model_name

    def chunk_with_context(self, text, context_length=1000):
        """Split ``text`` into chunks made of whole sentences.

        Sentences end at ``.``, ``!`` or ``?`` and keep their own terminator.
        The split is purely punctuation based, so a ``.`` inside a number or
        an abbreviation also ends a sentence.

        Args:
            text: Input text to split.
            context_length: Maximum chunk length in characters.  A single
                sentence longer than this is emitted as its own chunk.

        Returns:
            A list of chunks; sentences inside a chunk are joined by a single
            space.  Blank input yields an empty list.
        """
        # Capture each sentence together with its closing terminator(s) so
        # "?" and "!" are not rewritten as "." and no stray "." is appended.
        sentences = (part.strip() for part in re.findall(r'[^.!?]+[.!?]*', text))

        chunks = []
        pending = []        # sentences collected for the chunk being built
        pending_length = 0  # length of " ".join(pending)

        for sentence in sentences:
            if not sentence:
                continue

            if pending:
                joined_length = pending_length + 1 + len(sentence)
            else:
                joined_length = len(sentence)

            if pending and joined_length > context_length:
                chunks.append(" ".join(pending))
                pending = []
                joined_length = len(sentence)

            pending.append(sentence)
            pending_length = joined_length

        if pending:
            chunks.append(" ".join(pending))

        return chunks

3. 임베딩 모델 선택

3.1 비교 테이블

모델	성능	속도	메모리	용도
all-MiniLM-L6-v2	86.4	빠름	100MB	일반
all-mpnet-base-v2	88.2	중간	400MB	정확도
BGE-M3	90.1	중간	500MB	다국어
Sentence-BERT	89.8	느림	700MB	높은 정확도

3.2 성능 비교 코드

from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import time

def evaluate_embedding_models():
    """Benchmark a fixed set of embedding models on three sample sentences.

    Each model is loaded, used to encode the sample sentences once, and
    measured.  The timing covers that single ``encode`` call, so it includes
    any first-call warm-up cost.

    Returns:
        A dict keyed by model name.  Each value is a dict with:
            ``'time'``: seconds spent in ``encode``.
            ``'similarity'``: cosine similarity between the embeddings of the
                first two sample sentences.
            ``'memory'``: size of the model parameters in MB.
    """
    model_ids = {
        'all-MiniLM-L6-v2': 'all-MiniLM-L6-v2',
        'all-mpnet-base-v2': 'all-mpnet-base-v2',
        'BGE-M3': 'BAAI/bge-m3',
    }

    test_sentences = [
        "Machine learning is a subset of artificial intelligence.",
        "Deep learning uses neural networks with multiple layers.",
        "Natural language processing deals with human language understanding."
    ]

    results = {}
    for name, model_id in model_ids.items():
        # Load one model at a time so only a single model is resident while
        # it is being measured.
        model = SentenceTransformer(model_id)

        # perf_counter is monotonic and meant for measuring short intervals.
        start_time = time.perf_counter()
        embeddings = model.encode(test_sentences)
        elapsed = time.perf_counter() - start_time

        similarity = cosine_similarity([embeddings[0]], [embeddings[1]])[0][0]

        # Sum over every parameter tensor; the state dict of the first
        # sub-module has no plain 'weight' entry to look up.
        parameter_bytes = sum(
            parameter.numel() * parameter.element_size()
            for parameter in model.parameters()
        )

        results[name] = {
            'time': elapsed,
            'similarity': similarity,
            'memory': parameter_bytes / (1024**2),  # MB
        }

    return results

# Performance evaluation: run the benchmark and print one summary line per model.
perf_results = evaluate_embedding_models()
# The loop variable is named model_name so it does not rebind a module-level
# `model` object to a string.
for model_name, metrics in perf_results.items():
    print(f"{model_name}: Time={metrics['time']:.3f}s, Similarity={metrics['similarity']:.3f}")

4. 벡터 데이터베이스 비교

4.1 Chroma (가볍고 빠름)

import chromadb
from chromadb import Client

class ChromaVectorStore:
    """Thin wrapper around a single Chroma collection."""

    def __init__(self, collection_name="documents"):
        """Open (or create) ``collection_name`` on a default Chroma client."""
        self.client = Client()
        self.collection = self.client.get_or_create_collection(collection_name)

    @staticmethod
    def _as_plain_list(vectors):
        """Return ``vectors`` as nested Python lists (numpy arrays are converted)."""
        return vectors.tolist() if hasattr(vectors, "tolist") else vectors

    def add_documents(self, documents, embeddings, metadatas=None, ids=None):
        """Add documents with their embeddings to the collection.

        Args:
            documents: Document texts.
            embeddings: One embedding per document (lists or a numpy array).
            metadatas: Optional list with one metadata dict per document.
            ids: Optional list with one unique id per document.  Random UUID
                strings are generated when omitted.

        Returns:
            The list of ids the documents were stored under.
        """
        import uuid

        documents = list(documents)
        if not documents:
            return []

        if ids is None:
            # The collection requires an id per record; generate unique ones
            # so repeated calls never collide.
            ids = [str(uuid.uuid4()) for _ in documents]

        self.collection.add(
            ids=ids,
            documents=documents,
            embeddings=self._as_plain_list(embeddings),
            metadatas=metadatas,
        )
        return ids

    def search(self, query_embedding, n_results=5):
        """Return the raw Chroma query result for the nearest ``n_results`` records."""
        results = self.collection.query(
            query_embeddings=[self._as_plain_list(query_embedding)],
            n_results=n_results
        )
        return results

# Usage example: create the store; the commented lines show how documents would be added.
chroma_store = ChromaVectorStore()
# embeddings = model.encode(documents)
# chroma_store.add_documents(documents, embeddings)

4.2 Qdrant (고성능)

from qdrant_client import QdrantClient
from qdrant_client.models import Filter, FieldCondition, MatchValue

class QdrantVectorStore:
    """Thin wrapper around a single Qdrant collection using cosine distance."""

    def __init__(self, host="localhost", port=6333, collection_name="documents",
                 vector_size=384, recreate=False):
        """Connect to Qdrant and make sure the collection exists.

        Args:
            host: Qdrant host name.
            port: Qdrant port.
            collection_name: Name of the collection to use.
            vector_size: Dimensionality of the stored vectors; must match the
                embedding model.
            recreate: When true, an existing collection is dropped and created
                again (all stored points are lost).  By default an existing
                collection is reused so constructing the store never deletes
                data.
        """
        from qdrant_client.models import Distance, VectorParams

        self.client = QdrantClient(host=host, port=port)
        self.collection_name = collection_name

        exists = self.client.collection_exists(self.collection_name)
        if exists and recreate:
            self.client.delete_collection(self.collection_name)
            exists = False
        if not exists:
            self.client.create_collection(
                collection_name=self.collection_name,
                vectors_config=VectorParams(size=vector_size, distance=Distance.COSINE),
            )

    @staticmethod
    def _as_plain_list(vector):
        """Return ``vector`` as a plain Python list (numpy arrays are converted)."""
        return vector.tolist() if hasattr(vector, "tolist") else vector

    def add_documents(self, documents, embeddings, metadatas=None):
        """Store documents with their embeddings as new points.

        Args:
            documents: Document texts.
            embeddings: One embedding per document.
            metadatas: Optional list with one metadata dict per document.

        Raises:
            ValueError: If the inputs do not all have the same length.
        """
        import uuid

        from qdrant_client.models import PointStruct

        documents = list(documents)
        if not documents:
            return
        if metadatas is None:
            metadatas = [None] * len(documents)
        if not (len(documents) == len(embeddings) == len(metadatas)):
            raise ValueError("documents, embeddings and metadatas must have the same length")

        points = [
            PointStruct(
                # Random UUIDs keep a later batch from overwriting the points
                # of an earlier one, which positional ids (0, 1, 2, ...) would.
                id=str(uuid.uuid4()),
                vector=self._as_plain_list(embedding),
                payload={"text": doc, "metadata": metadata if metadata else {}},
            )
            for doc, embedding, metadata in zip(documents, embeddings, metadatas)
        ]
        self.client.upsert(
            collection_name=self.collection_name,
            points=points
        )

    def search(self, query_embedding, n_results=5):
        """Return the client's search result for the nearest ``n_results`` points."""
        # NOTE(review): newer qdrant-client releases appear to favour
        # `query_points` over `search` - confirm against the pinned version.
        results = self.client.search(
            collection_name=self.collection_name,
            query_vector=self._as_plain_list(query_embedding),
            limit=n_results
        )
        return results

# Usage example: create the store; the commented line shows how documents would be added.
qdrant_store = QdrantVectorStore()
# qdrant_store.add_documents(documents, embeddings)

4.3 pgvector (SQL 기반)


python
import psycopg2
from psycopg2.extras import Json
import numpy as np

class PGVectorStore:
    def __init__(self, connection_string):
        """Connect to PostgreSQL and make sure the embeddings table exists.

        Args:
            connection_string: Connection string passed to ``psycopg2.connect``.
        """
        self.conn = psycopg2.connect(connection_string)
        try:
            self.create_table()
        except Exception:
            # Do not leak the open connection when the schema setup fails.
            self.conn.close()
            raise

    def create_table(self):
        """Create the pgvector extension, the embeddings table and its index if missing.

        All statements run in one transaction: it is committed when every
        statement succeeds and rolled back when any of them fails, so the
        connection is never left in an aborted transaction.
        """
        with self.conn, self.conn.cursor() as cur:
            # The VECTOR column type only exists once the extension is installed.
            cur.execute("CREATE EXTENSION IF NOT EXISTS vector")
            cur.execute("""
                CREATE TABLE IF NOT EXISTS document_embeddings (
                    id SERIAL PRIMARY KEY,
                    document TEXT,
                    embedding VECTOR(384),
                    metadata JSONB,
                    created_at TIMESTAMP DEFAULT NOW()
                )
            """)
            # NOTE(review): an ivfflat index is presumably more effective when
            # built after the table holds data - confirm against pgvector docs.
            cur.execute("CREATE INDEX IF NOT EXISTS idx_embedding ON document_embeddings USING ivfflat (embedding vector_cosine_ops)")

    def add_documents(self,

---

📥 **Get the full guide on Gumroad**: https://gumroad.com/l/auto ($7)

推荐订阅源

DEV Community