kcg-ai-monitoring/prediction/db/partition_manager.py
htlee 2ceeb966d8 feat(prediction): Phase 1-2 detection model registry + snapshot 관찰 보강
- models_core 패키지 신설 — BaseDetectionModel / ModelContext / ModelResult
  + Registry (ACTIVE 버전 인스턴스화, DAG 순환 검출, topo 플랜)
  + DAGExecutor (PRIMARY→ctx.shared 주입, SHADOW persist-only 오염 차단)
  + params_loader (5분 TTL 캐시), feature_flag (PREDICTION_USE_MODEL_REGISTRY)
- V034 스키마 정합성 사전 검증 + silent error 3건 선제 방어
  · model_id VARCHAR(64) 초과 시 __init__ 에서 즉시 ValueError
  · metric_key VARCHAR(64) 초과는 경고 후 drop (다른 metric 는 저장)
  · persist 가 ctx.conn 재사용 (pool maxconn=5 고갈 방지)
- scheduler.py — 10단계 feature flag 분기 (기본 0, 구 경로 보존)
- partition_manager — detection_model_run_outputs 월별 파티션 자동 생성/DROP
- 유닛테스트 15 케이스 전체 통과 (DAG 순환, SHADOW 오염 차단, 길이 검증)
- snapshot 스크립트 (hourly/diagnostic) 개선
  · spoofing gt0/gt03/gt05/gt07 세분화 — 'silent fault' vs 'no signal' 구분
  · V030 gear_identity_collisions 원시 섹션 (CRITICAL 51건 OPEN 포착)
  · V034 detection_model_* 모니터링 섹션 (Phase 2 대비)
  · stage timing 집계 + stats_hourly vs events category drift 감시

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-20 08:07:29 +09:00

251 lines
9.2 KiB
Python

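"""Partition maintenance for gear_correlation_raw_metrics and detection_model_run_outputs.

Runs as a daily APScheduler job:
- Read settings from system_config (hot-reload, no process restart needed)
- Create future partitions ahead of time
- DROP expired partitions
- Clean up score records that have not been observed recently
"""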
import logging
from datetime import date, datetime, timedelta
from config import qualified_table, settings
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Table identifiers interpolated into the SQL statements in this module.
# They are produced by config.qualified_table (presumably schema-qualified
# names -- confirm against config).
SYSTEM_CONFIG = qualified_table('system_config')
GEAR_CORRELATION_RAW_METRICS = qualified_table('gear_correlation_raw_metrics')
GEAR_CORRELATION_SCORES = qualified_table('gear_correlation_scores')
DETECTION_MODEL_RUN_OUTPUTS = qualified_table('detection_model_run_outputs')
def _get_config_int(conn, key: str, default: int) -> int:
    """Look up an integer setting in system_config, falling back to ``default``.

    The stored value is read as text and surrounding double quotes are
    stripped before conversion, so a value rendered as ``"7"`` parses as 7.

    This is best-effort: a missing row, a NULL value, a query error or a
    non-numeric value all yield ``default``. Failures are logged instead of
    being swallowed silently, and the connection is rolled back so that a
    failed statement does not leave the transaction in an aborted state for
    the statements that follow. Because of that rollback, call this before
    starting any writes on ``conn``.

    Args:
        conn: Open database connection.
        key: Value of the ``key`` column to look up.
        default: Value returned when the setting is absent or unusable.

    Returns:
        The configured integer, or ``default``.
    """
    cur = conn.cursor()
    try:
        cur.execute(
            f"SELECT value::text FROM {SYSTEM_CONFIG} WHERE key = %s",
            (key,),
        )
        row = cur.fetchone()
        if row is None or row[0] is None:
            return default
        return int(row[0].strip('"'))
    except Exception as e:
        logger.warning(
            'failed to read config %s, using default %s: %s', key, default, e,
        )
        try:
            # Clear a possibly aborted transaction so later statements on
            # this connection are not rejected.
            conn.rollback()
        except Exception:
            logger.warning(
                'rollback after failed config read also failed', exc_info=True,
            )
        return default
    finally:
        cur.close()
def _create_future_partitions(conn, days_ahead: int) -> int:
    """Create daily gear_correlation_raw_metrics partitions for upcoming days.

    Covers today through today + ``days_ahead`` (inclusive). A partition that
    already exists in the configured schema is skipped. Each new partition is
    committed on its own, so the returned count only includes partitions that
    are durable, and a failure on one day does not discard the partitions
    created before it. Processing stops at the first failure.

    Args:
        conn: Open database connection; committed / rolled back here.
        days_ahead: Number of days after today to cover.

    Returns:
        Number of partitions created and committed.
    """
    cur = conn.cursor()
    created = 0
    try:
        # Evaluate "today" once so a run that crosses midnight stays consistent.
        today = date.today()
        for offset in range(days_ahead + 1):
            day = today + timedelta(days=offset)
            partition_name = f'gear_correlation_raw_metrics_{day.strftime("%Y%m%d")}'
            cur.execute(
                "SELECT 1 FROM pg_class c "
                "JOIN pg_namespace n ON n.oid = c.relnamespace "
                "WHERE c.relname = %s AND n.nspname = %s",
                (partition_name, settings.KCGDB_SCHEMA),
            )
            if cur.fetchone() is not None:
                continue
            next_day = day + timedelta(days=1)
            cur.execute(
                f"CREATE TABLE IF NOT EXISTS {qualified_table(partition_name)} "
                f"PARTITION OF {GEAR_CORRELATION_RAW_METRICS} "
                f"FOR VALUES FROM ('{day.isoformat()}') TO ('{next_day.isoformat()}')"
            )
            # Commit before counting/logging: DDL is transactional, so an
            # uncommitted CREATE would vanish on a later rollback.
            conn.commit()
            created += 1
            logger.info('created partition: %s.%s', settings.KCGDB_SCHEMA, partition_name)
        # Ends the read-only transaction opened by the catalog lookups.
        conn.commit()
    except Exception as e:
        conn.rollback()
        logger.exception('failed to create partitions: %s', e)
    finally:
        cur.close()
    return created
def _drop_expired_partitions(conn, retention_days: int) -> int:
    """Drop daily gear_correlation_raw_metrics partitions older than the retention window.

    A partition is expired when the date encoded in its name suffix
    (``YYYYMMDD``) is earlier than today - ``retention_days``. Tables whose
    suffix is not a valid date are ignored. Each DROP is committed on its own
    so the returned count only includes partitions that are really gone.
    Processing stops at the first failure.

    A negative ``retention_days`` would move the cutoff into the future and
    drop partitions that are still in use, so it is rejected.

    Args:
        conn: Open database connection; committed / rolled back here.
        retention_days: Number of days of partitions to keep (>= 0).

    Returns:
        Number of partitions dropped and committed.
    """
    if retention_days < 0:
        logger.warning(
            'skip dropping partitions: invalid retention_days=%s', retention_days,
        )
        return 0
    cutoff = date.today() - timedelta(days=retention_days)
    cur = conn.cursor()
    dropped = 0
    try:
        cur.execute(
            "SELECT c.relname FROM pg_class c "
            "JOIN pg_namespace n ON n.oid = c.relnamespace "
            "WHERE c.relname LIKE 'gear_correlation_raw_metrics_%%' "
            "AND n.nspname = %s AND c.relkind = 'r'",
            (settings.KCGDB_SCHEMA,),
        )
        for (name,) in cur.fetchall():
            date_str = name.rsplit('_', 1)[-1]
            try:
                partition_date = datetime.strptime(date_str, '%Y%m%d').date()
            except ValueError:
                # Not a daily partition name; leave the table alone.
                continue
            if partition_date >= cutoff:
                continue
            cur.execute(f'DROP TABLE IF EXISTS {qualified_table(name)}')
            # Commit before counting/logging so a later failure cannot
            # roll back a drop that was already reported.
            conn.commit()
            dropped += 1
            logger.info('dropped expired partition: %s.%s', settings.KCGDB_SCHEMA, name)
        # Ends the read-only transaction opened by the catalog lookup.
        conn.commit()
    except Exception as e:
        conn.rollback()
        logger.exception('failed to drop partitions: %s', e)
    finally:
        cur.close()
    return dropped
def _create_future_monthly_detection_partitions(conn, months_ahead: int) -> int:
    """Create monthly detection_model_run_outputs partitions for upcoming months.

    Monthly RANGE partitions (cycle_started_at) -- V034 pre-creates 2026-04/05
    through Flyway; afterwards this function runs daily and keeps
    ``months_ahead`` months prepared in advance. The current month is always
    included. Each new partition is committed on its own, so the returned
    count only includes partitions that are durable. Processing stops at the
    first failure.

    Args:
        conn: Open database connection; committed / rolled back here.
        months_ahead: Number of months after the current one to cover.

    Returns:
        Number of partitions created and committed.
    """
    cur = conn.cursor()
    created = 0
    try:
        today = date.today()
        # Months counted from year 0 so that adding an offset carries into
        # the year automatically.
        current_index = today.year * 12 + (today.month - 1)
        for offset in range(months_ahead + 1):
            year, month0 = divmod(current_index + offset, 12)
            next_year, next_month0 = divmod(current_index + offset + 1, 12)
            month = month0 + 1
            start = date(year, month, 1)
            end = date(next_year, next_month0 + 1, 1)
            partition_name = f'detection_model_run_outputs_{year:04d}_{month:02d}'
            cur.execute(
                "SELECT 1 FROM pg_class c "
                "JOIN pg_namespace n ON n.oid = c.relnamespace "
                "WHERE c.relname = %s AND n.nspname = %s",
                (partition_name, settings.KCGDB_SCHEMA),
            )
            if cur.fetchone() is not None:
                continue
            cur.execute(
                f"CREATE TABLE IF NOT EXISTS {qualified_table(partition_name)} "
                f"PARTITION OF {DETECTION_MODEL_RUN_OUTPUTS} "
                f"FOR VALUES FROM ('{start.isoformat()}') TO ('{end.isoformat()}')"
            )
            # Commit before counting/logging: DDL is transactional, so an
            # uncommitted CREATE would vanish on a later rollback.
            conn.commit()
            created += 1
            logger.info(
                'created partition: %s.%s', settings.KCGDB_SCHEMA, partition_name,
            )
        # Ends the read-only transaction opened by the catalog lookups.
        conn.commit()
    except Exception as e:
        conn.rollback()
        logger.exception('failed to create detection_model_run_outputs partitions: %s', e)
    finally:
        cur.close()
    return created
def _drop_expired_monthly_detection_partitions(conn, retention_months: int) -> int:
    """Drop monthly detection_model_run_outputs partitions older than the retention window.

    Raw SHADOW results have little value once the comparison analysis is
    done -- the default retention is 1 month. Aggregates are preserved in
    detection_model_metrics, so discarding the raw rows keeps things traceable.

    A partition is expired when the month encoded in its name
    (``..._YYYY_MM``) starts before the first day of the month that lies
    ``retention_months`` before the current one. Tables whose suffix does not
    parse as year/month are ignored. Each DROP is committed on its own so the
    returned count only includes partitions that are really gone. Processing
    stops at the first failure.

    A negative ``retention_months`` would move the cutoff into the future
    (dropping the partition currently being written) and is rejected.

    Args:
        conn: Open database connection; committed / rolled back here.
        retention_months: Number of past months to keep (>= 0).

    Returns:
        Number of partitions dropped and committed.
    """
    if retention_months < 0:
        logger.warning(
            'skip dropping detection_model_run_outputs partitions: '
            'invalid retention_months=%s', retention_months,
        )
        return 0

    today = date.today()
    # Months counted from year 0 so the subtraction borrows from the year.
    cutoff_index = today.year * 12 + (today.month - 1) - retention_months
    cutoff_year, cutoff_month0 = divmod(cutoff_index, 12)
    if cutoff_year < date.min.year:
        # The cutoff precedes every representable date: nothing can be older.
        return 0
    cutoff = date(cutoff_year, cutoff_month0 + 1, 1)

    prefix = 'detection_model_run_outputs_'
    cur = conn.cursor()
    dropped = 0
    try:
        cur.execute(
            "SELECT c.relname FROM pg_class c "
            "JOIN pg_namespace n ON n.oid = c.relnamespace "
            "WHERE c.relname LIKE 'detection_model_run_outputs_%%' "
            "AND n.nspname = %s AND c.relkind = 'r'",
            (settings.KCGDB_SCHEMA,),
        )
        for (name,) in cur.fetchall():
            # '_' is a single-character wildcard in LIKE, so confirm the
            # literal prefix before slicing it off.
            if not name.startswith(prefix):
                continue
            tail = name[len(prefix):]
            try:
                yy, mm = tail.split('_')
                partition_start = date(int(yy), int(mm), 1)
            except ValueError:
                # Not a monthly partition name; leave the table alone.
                continue
            if partition_start >= cutoff:
                continue
            cur.execute(f'DROP TABLE IF EXISTS {qualified_table(name)}')
            # Commit before counting/logging so a later failure cannot
            # roll back a drop that was already reported.
            conn.commit()
            dropped += 1
            logger.info(
                'dropped expired partition: %s.%s', settings.KCGDB_SCHEMA, name,
            )
        # Ends the read-only transaction opened by the catalog lookup.
        conn.commit()
    except Exception as e:
        conn.rollback()
        logger.exception('failed to drop detection_model_run_outputs partitions: %s', e)
    finally:
        cur.close()
    return dropped
def _cleanup_stale_scores(conn, cleanup_days: int) -> int:
    """Delete score records not observed for at least ``cleanup_days`` days.

    Args:
        conn: Open database connection; committed on success, rolled back
            on failure.
        cleanup_days: Age threshold in days, compared against
            ``last_observed_at``.

    Returns:
        Number of rows deleted, or 0 when the delete failed.
    """
    cursor = conn.cursor()
    try:
        delete_sql = (
            f"DELETE FROM {GEAR_CORRELATION_SCORES} "
            "WHERE last_observed_at < NOW() - make_interval(days => %s)"
        )
        cursor.execute(delete_sql, (cleanup_days,))
        removed = cursor.rowcount
        conn.commit()
    except Exception as exc:
        conn.rollback()
        logger.error('failed to cleanup stale scores: %s', exc)
        removed = 0
    finally:
        cursor.close()
    return removed
def maintain_partitions():
    """Daily partition maintenance -- invoked by the scheduler.

    Settings are read from system_config on every run, so a change made
    through the API takes effect on the next run.
    """
    from db import kcgdb

    with kcgdb.get_conn() as conn:
        def read_setting(name, fallback):
            # Thin shortcut that binds the connection for the lookups below.
            return _get_config_int(conn, name, fallback)

        # Settings first, in a fixed order, before any maintenance step runs.
        raw_retention_days = read_setting('partition.raw_metrics.retention_days', 7)
        raw_ahead_days = read_setting('partition.raw_metrics.create_ahead_days', 3)
        score_cleanup_days = read_setting('partition.scores.cleanup_days', 30)
        detection_ahead_months = read_setting(
            'partition.detection_model_run_outputs.create_ahead_months', 2,
        )
        detection_retention_months = read_setting(
            'partition.detection_model_run_outputs.retention_months', 1,
        )

        # Daily raw-metrics partitions and stale score rows.
        raw_created = _create_future_partitions(conn, raw_ahead_days)
        raw_dropped = _drop_expired_partitions(conn, raw_retention_days)
        scores_cleaned = _cleanup_stale_scores(conn, score_cleanup_days)

        # Monthly detection_model_run_outputs partitions.
        detection_created = _create_future_monthly_detection_partitions(
            conn, detection_ahead_months,
        )
        detection_dropped = _drop_expired_monthly_detection_partitions(
            conn, detection_retention_months,
        )

    summary_values = (
        raw_created, raw_dropped, scores_cleaned,
        raw_retention_days, raw_ahead_days, score_cleanup_days,
        detection_created, detection_dropped,
        detection_retention_months, detection_ahead_months,
    )
    logger.info(
        'partition maintenance: %d created, %d dropped, %d stale scores cleaned '
        '(retention=%dd, ahead=%dd, cleanup=%dd); '
        'detection_model_run_outputs: %d created, %d dropped '
        '(retention_months=%d, ahead_months=%d)',
        *summary_values,
    )