kcg-ai-monitoring/prediction/scripts/hourly-analysis-snapshot.sh
htlee 2ceeb966d8 feat(prediction): Phase 1-2 detection model registry + snapshot 관찰 보강
- models_core 패키지 신설 — BaseDetectionModel / ModelContext / ModelResult
  + Registry (ACTIVE 버전 인스턴스화, DAG 순환 검출, topo 플랜)
  + DAGExecutor (PRIMARY→ctx.shared 주입, SHADOW persist-only 오염 차단)
  + params_loader (5분 TTL 캐시), feature_flag (PREDICTION_USE_MODEL_REGISTRY)
- V034 스키마 정합성 사전 검증 + silent error 3건 선제 방어
  · model_id VARCHAR(64) 초과 시 __init__ 에서 즉시 ValueError
  · metric_key VARCHAR(64) 초과는 경고 후 drop (다른 metric 는 저장)
  · persist 가 ctx.conn 재사용 (pool maxconn=5 고갈 방지)
- scheduler.py — 10단계 feature flag 분기 (기본 0, 구 경로 보존)
- partition_manager — detection_model_run_outputs 월별 파티션 자동 생성/DROP
- 유닛테스트 15 케이스 전체 통과 (DAG 순환, SHADOW 오염 차단, 길이 검증)
- snapshot 스크립트 (hourly/diagnostic) 개선
  · spoofing gt0/gt03/gt05/gt07 세분화 — 'silent fault' vs 'no signal' 구분
  · V030 gear_identity_collisions 원시 섹션 (CRITICAL 51건 OPEN 포착)
  · V034 detection_model_* 모니터링 섹션 (Phase 2 대비)
  · stage timing 집계 + stats_hourly vs events category drift 감시

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-20 08:07:29 +09:00

488 lines
19 KiB
Bash
Executable File

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/bin/bash
# prediction 시간당 상태 스냅샷 수집기 (DAR-03 G코드 + 어구 위반 포함)
# 실행 환경: redis-211 서버 (prediction 서비스 호스트)
# cron: 0 * * * * /home/apps/kcg-ai-prediction/scripts/hourly-analysis-snapshot.sh
#
# 출력: /home/apps/kcg-ai-prediction/data/hourly-analysis/YYYYMMDD-HHMM.txt
set -u
OUTDIR=/home/apps/kcg-ai-prediction/data/hourly-analysis
mkdir -p "$OUTDIR"
STAMP=$(date '+%Y%m%d-%H%M')
OUT="$OUTDIR/$STAMP.txt"
export PGPASSWORD=Kcg2026ai
PSQL="psql -U kcg-app -d kcgaidb -h 211.208.115.83 -P pager=off"
{
echo "# prediction hourly snapshot (DAR-03 enhanced)"
echo "# generated: $(date '+%Y-%m-%d %H:%M:%S %Z')"
echo "# host: $(hostname)"
echo ""
$PSQL << 'SQL'
\echo === 1. VESSEL_ANALYSIS overview (last 1h) ===
SELECT count(*) total,
count(*) FILTER (WHERE vessel_type != 'UNKNOWN') pipeline_path,
count(*) FILTER (WHERE vessel_type = 'UNKNOWN') lightweight_path,
count(*) FILTER (WHERE is_dark) dark,
count(*) FILTER (WHERE spoofing_score > 0.5) spoof_hi,
count(*) FILTER (WHERE transship_suspect) transship,
count(*) FILTER (WHERE gear_judgment IS NOT NULL AND gear_judgment != '') gear_violation,
count(*) FILTER (WHERE risk_level='CRITICAL') crit_lvl,
count(*) FILTER (WHERE risk_level='HIGH') high_lvl,
max(risk_score) max_risk,
round(avg(risk_score)::numeric, 2) avg_risk
FROM kcg.vessel_analysis_results
WHERE analyzed_at > now() - interval '1 hour';
\echo
\echo === 1a. SPOOFING signal health (silent-vs-fault 구분) ===
-- spoof_hi=0 이 "고장"인지 "신호 없음"인지 구분하려면 gt0 / gt03 / gt05 / max 를 모두 본다.
-- gt0 가 0 이면 파이프라인이 spoofing_score 를 아예 계산하지 못하고 있다는 신호 (원인 추적 필요).
-- gt0>0 인데 gt05=0 이면 알고리즘은 동작 중이나 threshold 돌파 대상이 없다 (정상일 수 있음).
SELECT count(*) total,
count(*) FILTER (WHERE spoofing_score > 0) gt0,
count(*) FILTER (WHERE spoofing_score > 0.3) gt03,
count(*) FILTER (WHERE spoofing_score > 0.5) gt05,
count(*) FILTER (WHERE spoofing_score > 0.7) gt07,
round(avg(spoofing_score)::numeric, 4) avg_score,
round(max(spoofing_score)::numeric, 4) max_score
FROM kcg.vessel_analysis_results
WHERE analyzed_at > now() - interval '1 hour';
\echo
\echo === 2. ZONE x DARK x GEAR_VIOLATION distribution ===
SELECT zone_code,
count(*) total,
count(*) FILTER (WHERE is_dark) dark,
count(*) FILTER (WHERE transship_suspect) transship,
count(*) FILTER (WHERE gear_judgment IS NOT NULL AND gear_judgment != '') gear_viol,
count(*) FILTER (WHERE risk_score >= 70) crit,
round(avg(risk_score)::numeric, 1) avg_risk
FROM kcg.vessel_analysis_results
WHERE analyzed_at > now() - interval '1 hour'
GROUP BY zone_code ORDER BY total DESC;
\echo
\echo === 3. DARK GAP distribution ===
SELECT CASE
WHEN gap_duration_min < 30 THEN 'a_lt30'
WHEN gap_duration_min < 60 THEN 'b_30-59'
WHEN gap_duration_min < 120 THEN 'c_60-119'
WHEN gap_duration_min < 360 THEN 'd_120-359'
WHEN gap_duration_min < 1440 THEN 'e_360-1439'
ELSE 'f_gte1440' END gap_bucket,
count(*) total,
count(*) FILTER (WHERE is_dark) dark,
count(*) FILTER (WHERE is_dark AND vessel_type='UNKNOWN') dark_lw,
count(*) FILTER (WHERE is_dark AND vessel_type!='UNKNOWN') dark_pipeline
FROM kcg.vessel_analysis_results
WHERE analyzed_at > now() - interval '1 hour'
GROUP BY gap_bucket ORDER BY gap_bucket;
\echo
\echo === 4. DARK by activity_state ===
SELECT activity_state, count(*), round(avg(gap_duration_min)::numeric, 0) avg_gap
FROM kcg.vessel_analysis_results
WHERE analyzed_at > now() - interval '1 hour' AND is_dark
GROUP BY activity_state ORDER BY count DESC;
\echo
\echo === 5. DARK sample top 20 by gap ===
SELECT mmsi, zone_code, activity_state, gap_duration_min, risk_score
FROM (
SELECT DISTINCT ON (mmsi) mmsi, zone_code, activity_state, gap_duration_min,
risk_score, analyzed_at
FROM kcg.vessel_analysis_results
WHERE analyzed_at > now() - interval '1 hour' AND is_dark
ORDER BY mmsi, analyzed_at DESC
) latest ORDER BY gap_duration_min DESC LIMIT 20;
\echo
\echo === 6. EVENTS last 1h by category x level ===
SELECT category, level, count(*) cnt
FROM kcg.prediction_events
WHERE created_at > now() - interval '1 hour'
GROUP BY category, level ORDER BY cnt DESC;
\echo
\echo === 7. STATS_HOURLY latest 3 rows ===
SELECT stat_hour, total_detections, event_count, critical_count,
by_category::text, by_zone::text
FROM kcg.prediction_stats_hourly
ORDER BY stat_hour DESC LIMIT 3;
\echo
\echo === 8. KPI REALTIME ===
SELECT kpi_key, value, trend, delta_pct, updated_at
FROM kcg.prediction_kpi_realtime ORDER BY kpi_key;
\echo
\echo === 9. RISK_SCORE histogram ===
SELECT CASE
WHEN risk_score < 10 THEN 'a_0-9'
WHEN risk_score < 30 THEN 'b_10-29'
WHEN risk_score < 50 THEN 'c_30-49'
WHEN risk_score < 70 THEN 'd_50-69'
WHEN risk_score < 90 THEN 'e_70-89'
ELSE 'f_90-100' END bucket,
count(*) cnt,
count(*) FILTER (WHERE vessel_type='UNKNOWN') lightweight
FROM kcg.vessel_analysis_results
WHERE analyzed_at > now() - interval '1 hour'
GROUP BY bucket ORDER BY bucket;
\echo
\echo === 10. TRANSSHIP + SPOOF + FLEET ===
SELECT
count(*) FILTER (WHERE transship_suspect) transship_ct,
count(*) FILTER (WHERE spoofing_score > 0.7) spoof_gt070,
count(*) FILTER (WHERE speed_jump_count > 0) speed_jumps,
count(*) FILTER (WHERE fleet_is_leader) fleet_leader,
count(DISTINCT fleet_cluster_id) FILTER (WHERE fleet_cluster_id > 0) fleet_clusters
FROM kcg.vessel_analysis_results
WHERE analyzed_at > now() - interval '1 hour';
\echo
\echo ===================================================================
\echo === FISHERY PERMIT CN REGISTRY (V029 - 한중어업협정)
\echo ===================================================================
\echo
\echo === P1. fishery_permit_cn year-by-year ===
SELECT permit_year, count(*) permits,
count(DISTINCT fishery_code) codes,
count(DISTINCT applicant_cn) applicants,
max(loaded_at) loaded_at
FROM kcg.fishery_permit_cn
GROUP BY permit_year ORDER BY permit_year DESC;
\echo
\echo === P2. fleet_vessels matching (current year = registry) ===
SELECT permit_year,
count(*) total,
count(mmsi) with_mmsi,
round(count(mmsi)::numeric / NULLIF(count(*),0) * 100, 1) match_pct,
max(last_seen_at) last_match
FROM kcg.fleet_vessels
WHERE permit_year IS NOT NULL
GROUP BY permit_year ORDER BY permit_year DESC;
\echo
\echo === P3. fleet_vessels breakdown by fishery_code (current year) ===
SELECT fishery_code, count(*) total, count(mmsi) matched,
round(count(mmsi)::numeric / NULLIF(count(*),0) * 100, 1) pct
FROM kcg.fleet_vessels
WHERE permit_year = EXTRACT(YEAR FROM now())::int
GROUP BY fishery_code ORDER BY total DESC;
\echo
\echo === P3.5 match_method distribution (NAME_EXACT vs NAME_FUZZY) ===
SELECT coalesce(match_method, '(unmatched)') method,
count(*) cnt,
round(avg(match_confidence)::numeric, 3) avg_conf
FROM kcg.fleet_vessels
WHERE permit_year = EXTRACT(YEAR FROM now())::int
GROUP BY method ORDER BY cnt DESC;
\echo
\echo === P3.6 fishery_code × match_method cross ===
SELECT fishery_code,
count(*) FILTER (WHERE match_method = 'NAME_EXACT') exact,
count(*) FILTER (WHERE match_method = 'NAME_FUZZY') fuzzy,
count(*) FILTER (WHERE mmsi IS NULL) unmatched,
count(*) total
FROM kcg.fleet_vessels
WHERE permit_year = EXTRACT(YEAR FROM now())::int
GROUP BY fishery_code ORDER BY total DESC;
\echo
\echo === P4. vessel_analysis_results.gear_code distribution (last 1h) ===
SELECT coalesce(gear_code, '(null)') gear_code,
count(*) cnt,
round(avg(risk_score)::numeric, 1) avg_risk
FROM kcg.vessel_analysis_results
WHERE analyzed_at > now() - interval '1 hour'
GROUP BY gear_code ORDER BY cnt DESC LIMIT 15;
\echo
\echo === P5. fleet_role distribution (last 1h, from registry match) ===
SELECT fleet_role, count(*) cnt,
count(*) FILTER (WHERE fleet_is_leader) is_leader,
round(avg(risk_score)::numeric, 1) avg_risk
FROM kcg.vessel_analysis_results
WHERE analyzed_at > now() - interval '1 hour'
GROUP BY fleet_role ORDER BY cnt DESC;
\echo
\echo === G1. PIPELINE vessel_type distribution ===
SELECT vessel_type, count(*),
round(avg(ucaf_score)::numeric, 3) avg_ucaf,
round(avg(ucft_score)::numeric, 3) avg_ucft,
round(avg(risk_score)::numeric, 1) avg_risk
FROM kcg.vessel_analysis_results
WHERE analyzed_at > now() - interval '1 hour'
GROUP BY vessel_type ORDER BY count DESC;
\echo
\echo === G2. GEAR_GROUP_PARENT_RESOLUTION ===
SELECT status, count(*),
round(avg(confidence)::numeric, 3) avg_conf,
round(avg(top_score)::numeric, 3) avg_top,
round(avg(stable_cycles)::numeric, 1) avg_stable
FROM kcg.gear_group_parent_resolution
GROUP BY status ORDER BY count DESC;
\echo
\echo === G3. GEAR_CORRELATION_SCORES distribution ===
SELECT CASE
WHEN current_score < 0.3 THEN 'a_lt0.3'
WHEN current_score < 0.5 THEN 'b_0.3-0.5'
WHEN current_score < 0.7 THEN 'c_0.5-0.7'
WHEN current_score < 0.85 THEN 'd_0.7-0.85'
ELSE 'e_gte0.85' END bucket,
count(*),
count(DISTINCT group_key) uniq_groups,
count(DISTINCT target_mmsi) uniq_targets
FROM kcg.gear_correlation_scores
WHERE updated_at > now() - interval '1 hour'
GROUP BY bucket ORDER BY bucket;
\echo
\echo ===================================================================
\echo === DAR-03 G-CODE DIAGNOSTICS (last 1h)
\echo ===================================================================
\echo
\echo === D1. gear_judgment distribution ===
SELECT coalesce(NULLIF(gear_judgment, ''), '(none)') judgment,
count(*) cnt,
round(avg(risk_score)::numeric, 1) avg_risk
FROM kcg.vessel_analysis_results
WHERE analyzed_at > now() - interval '1 hour'
GROUP BY judgment ORDER BY cnt DESC;
\echo
\echo === D2. G-code frequency ===
SELECT gcode, count(*) cnt,
round(avg(risk_score)::numeric, 1) avg_risk
FROM kcg.vessel_analysis_results,
LATERAL jsonb_array_elements_text(features->'g_codes') AS gcode
WHERE analyzed_at > now() - interval '1 hour'
GROUP BY gcode ORDER BY cnt DESC;
\echo
\echo === D3. G-01 zone x gear cross-table (VIOLATION CHECK) ===
SELECT zone_code, vessel_type, count(*) total,
count(*) FILTER (WHERE features->>'g_codes' LIKE '%G-01%') g01_violation
FROM kcg.vessel_analysis_results
WHERE analyzed_at > now() - interval '1 hour'
AND vessel_type != 'UNKNOWN' AND zone_code LIKE 'ZONE_%'
GROUP BY zone_code, vessel_type ORDER BY zone_code, vessel_type;
\echo
\echo === D3.5 pair_type distribution (DAR-03 base-target 탐색) ===
SELECT coalesce(features->>'pair_type', '(none)') pair_type,
count(*) cnt,
round(avg((features->>'similarity')::numeric)::numeric, 3) avg_sim,
round(avg((features->>'confidence')::numeric)::numeric, 3) avg_conf
FROM kcg.vessel_analysis_results
WHERE analyzed_at > now() - interval '1 hour'
AND features->>'pair_trawl_detected' = 'true'
GROUP BY pair_type ORDER BY cnt DESC;
\echo
\echo === D3.6 pair_trawl tier distribution (signal-strength tier) ===
SELECT coalesce(features->>'pair_tier', '(none)') tier,
count(*) cnt,
round(avg((features->>'similarity')::numeric)::numeric, 3) avg_sim,
round(avg((features->'gear_violation_evidence'->'G-06'->>'sync_duration_min')::numeric)::numeric, 1) avg_sync_min,
round(avg((features->'gear_violation_evidence'->'G-06'->>'mean_separation_nm')::numeric)::numeric, 3) avg_sep_nm
FROM kcg.vessel_analysis_results
WHERE analyzed_at > now() - interval '1 hour'
AND features->>'pair_trawl_detected' = 'true'
GROUP BY tier ORDER BY cnt DESC;
\echo
\echo === D3.7 G-02 closed-season + G-03 unregistered-gear counts ===
SELECT
count(*) FILTER (WHERE features->>'g_codes' LIKE '%G-02%') g02_count,
count(*) FILTER (WHERE features->>'g_codes' LIKE '%G-03%') g03_count,
count(*) FILTER (WHERE features->>'gear_judgment' = 'CLOSED_SEASON_FISHING') judg_closed,
count(*) FILTER (WHERE features->>'gear_judgment' = 'UNREGISTERED_GEAR') judg_unreg
FROM kcg.vessel_analysis_results
WHERE analyzed_at > now() - interval '1 hour';
\echo
\echo === D3.8 pair detection reject breakdown (last 1h journal) ===
\! ssh redis-211 "sudo journalctl -u kcg-ai-prediction --no-pager --since '1 hour ago' | grep -oE 'pair detection:[^$]+reject=\{[^}]+\}' | awk -F'reject=' '{print $2}' | sort | uniq -c | sort -rn | head -10" 2>/dev/null || true
\echo
\echo === D4. G-06 pair trawl detections ===
SELECT mmsi, zone_code, vessel_type, risk_score,
(features->'gear_violation_evidence'->'G-06'->>'sync_duration_min') sync_min,
(features->'gear_violation_evidence'->'G-06'->>'mean_separation_nm') sep_nm,
(features->'gear_violation_evidence'->'G-06'->>'pair_mmsi') pair_mmsi,
features->>'pair_trawl_detected' pt
FROM kcg.vessel_analysis_results
WHERE analyzed_at > now() - interval '1 hour'
AND (features->>'pair_trawl_detected' = 'true' OR features->>'g_codes' LIKE '%G-06%')
ORDER BY risk_score DESC LIMIT 20;
\echo
\echo === D5. G-04 MMSI tampering + G-05 gear drift ===
SELECT mmsi, zone_code, vessel_type, risk_score,
features->>'g_codes' g_codes,
(features->'gear_violation_evidence'->'G-04'->>'cycling_count') g04_cycle,
(features->'gear_violation_evidence'->'G-05'->>'drift_nm') g05_drift
FROM kcg.vessel_analysis_results
WHERE analyzed_at > now() - interval '1 hour'
AND (features->>'g_codes' LIKE '%G-04%' OR features->>'g_codes' LIKE '%G-05%')
ORDER BY risk_score DESC LIMIT 20;
\echo
\echo === D6. GEAR_ILLEGAL events (last 1h) ===
SELECT category, level, title, count(*) cnt
FROM kcg.prediction_events
WHERE created_at > now() - interval '1 hour'
AND category IN ('GEAR_ILLEGAL', 'MMSI_TAMPERING')
GROUP BY category, level, title ORDER BY cnt DESC;
\echo
\echo === D7. violation_categories ILLEGAL_GEAR breakdown ===
SELECT count(*) total,
count(*) FILTER (WHERE gear_judgment = 'ZONE_VIOLATION') zone_viol,
count(*) FILTER (WHERE gear_judgment = 'PAIR_TRAWL') pair_trawl,
count(*) FILTER (WHERE gear_judgment = 'GEAR_MISMATCH') mismatch
FROM kcg.vessel_analysis_results
WHERE analyzed_at > now() - interval '1 hour'
AND 'ILLEGAL_GEAR' = ANY(violation_categories);
\echo
\echo === D8. gear_violation_score histogram (pipeline vessels) ===
SELECT CASE
WHEN (features->>'gear_violation_score')::int = 0 THEN 'a_0 (no violation)'
WHEN (features->>'gear_violation_score')::int <= 15 THEN 'b_1-15 (G-01 or G-04/G-05)'
WHEN (features->>'gear_violation_score')::int <= 25 THEN 'c_16-25 (G-06 or combo)'
ELSE 'd_gt25 (multiple G-codes)' END bucket,
count(*) cnt,
round(avg(risk_score)::numeric, 1) avg_risk
FROM kcg.vessel_analysis_results
WHERE analyzed_at > now() - interval '1 hour'
AND vessel_type != 'UNKNOWN'
AND features->>'gear_violation_score' IS NOT NULL
GROUP BY bucket ORDER BY bucket;
\echo
\echo === D9. EVENTS 24h hourly trend (with GEAR_ILLEGAL) ===
SELECT date_trunc('hour', occurred_at AT TIME ZONE 'Asia/Seoul') hr,
count(*) tot,
count(*) FILTER (WHERE category='DARK_VESSEL') dark,
count(*) FILTER (WHERE category='ILLEGAL_TRANSSHIP') transship,
count(*) FILTER (WHERE category='EEZ_INTRUSION') eez,
count(*) FILTER (WHERE category='GEAR_ILLEGAL') gear_illegal,
count(*) FILTER (WHERE category='HIGH_RISK_VESSEL') high_risk,
count(*) FILTER (WHERE category='GEAR_IDENTITY_COLLISION') gear_collide,
count(*) FILTER (WHERE level='CRITICAL') critical
FROM kcg.prediction_events
WHERE created_at > now() - interval '24 hours'
GROUP BY hr ORDER BY hr DESC LIMIT 25;
\echo
\echo ===================================================================
\echo === V030 GEAR_IDENTITY_COLLISIONS (원시 테이블 관찰)
\echo ===================================================================
\echo
\echo === V030-1. severity x status 분포 (24h) ===
SELECT severity, status, count(*) cnt,
max(last_seen_at) last_seen
FROM kcg.gear_identity_collisions
WHERE last_seen_at > now() - interval '24 hours'
GROUP BY severity, status ORDER BY cnt DESC;
\echo
\echo === V030-2. coexistence/swap 상위 20건 (24h) ===
SELECT name, mmsi_lo, mmsi_hi, severity, status,
coexistence_count coex, swap_count swap,
round(max_distance_km::numeric, 1) max_km
FROM kcg.gear_identity_collisions
WHERE last_seen_at > now() - interval '24 hours'
ORDER BY (coexistence_count + swap_count * 5) DESC LIMIT 20;
\echo
\echo ===================================================================
\echo === V034 DETECTION_MODEL REGISTRY (Phase 1-2)
\echo ===================================================================
\echo
\echo === V034-1. model catalog + enabled 여부 ===
SELECT count(*) catalog_total,
count(*) FILTER (WHERE is_enabled) enabled
FROM kcg.detection_models;
\echo
\echo === V034-2. version 상태 x role 분포 ===
SELECT status, coalesce(role,'(null)') role, count(*) cnt
FROM kcg.detection_model_versions
GROUP BY status, role ORDER BY status, role;
\echo
\echo === V034-3. detection_model_run_outputs 1h 적재 현황 (feature flag ON 시 증가) ===
SELECT model_id, role, count(*) rows,
min(cycle_started_at) oldest, max(cycle_started_at) newest
FROM kcg.detection_model_run_outputs
WHERE cycle_started_at > now() - interval '1 hour'
GROUP BY model_id, role ORDER BY rows DESC;
\echo
\echo === V034-4. detection_model_metrics 최신 5 모델 평균 소요 ===
SELECT model_id, role,
round(avg(metric_value) FILTER (WHERE metric_key='cycle_duration_ms')::numeric, 1) avg_ms,
round(avg(metric_value) FILTER (WHERE metric_key='output_count')::numeric, 1) avg_out
FROM kcg.detection_model_metrics
WHERE cycle_started_at > now() - interval '1 hour'
GROUP BY model_id, role ORDER BY model_id, role;
\echo
\echo === C1. stats_hourly vs raw events 카테고리 drift (event_generator silent drop 감시) ===
-- raw prediction_events 에는 있지만 stats_hourly.by_category 에는 없는 카테고리 (반대도 표시)
WITH recent_events AS (
SELECT DISTINCT category FROM kcg.prediction_events
WHERE created_at > now() - interval '2 hours'
),
stats_cats AS (
SELECT DISTINCT jsonb_object_keys(by_category) AS category
FROM kcg.prediction_stats_hourly
WHERE stat_hour > now() - interval '2 hours'
)
SELECT 'only_in_events' gap, category FROM recent_events
WHERE category NOT IN (SELECT category FROM stats_cats)
UNION ALL
SELECT 'only_in_stats', category FROM stats_cats
WHERE category NOT IN (SELECT category FROM recent_events);
SQL
echo ""
echo "=== 13. CYCLE LOG (last 65 min) ==="
# stage_runner, DAGExecutor, detection_model_registry, Traceback 까지 함께 추적
journalctl -u kcg-ai-prediction --since '65 minutes ago' --no-pager 2>/dev/null | \
grep -E 'lightweight|event_generator:|stats_aggregator hourly|kpi_writer:|analysis cycle:|pair_trawl|gear_violation|GEAR_ILLEGAL|stage [a-z_]+ (ok|failed)|DAGExecutor done|detection model registry|ERROR|Traceback' | \
tail -80
echo ""
echo "=== 14. STAGE TIMING (last 65 min, 소요시간 상위 10 + 실패 전체) ==="
# stage ok in X.XXs / stage failed after 를 수집하여 실패+장시간 스테이지 식별
journalctl -u kcg-ai-prediction --since '65 minutes ago' --no-pager 2>/dev/null | \
grep -oE 'stage [a-z_@.[:blank:][:digit:].-]+ (ok in [0-9.]+s|failed)' | \
awk '/failed/ {print "FAIL " $0; next}
/ok in/ {n=split($0,a," "); sec=a[n]; sub(/s$/,"",sec); printf "%8.2fs %s\n", sec, $0}' | \
sort -rn | awk 'NR<=10 || /^FAIL/' | head -40
echo ""
echo "=== END ==="
} > "$OUT" 2>&1
echo "[snapshot] saved: $OUT"