- prediction/: FastAPI 7단계 분류 파이프라인 + 6개 탐지 알고리즘
- snpdb 궤적 조회 → 인메모리 캐시(13K척) → 분류 → kcgdb 저장
- APScheduler 5분 주기, Python 3.9 호환
- 버그 수정: @property last_bucket, SQL INTERVAL 바인딩, rollback, None 가드
- 보안: DB 비밀번호 하드코딩 제거 → env 환경변수 필수
- deploy/kcg-prediction.service: systemd 서비스 (redis-211, 포트 8001)
- deploy.yml: prediction CI/CD 배포 단계 추가 (192.168.1.18:32023)
- backend: PredictionProxyController (health/status/trigger 프록시)
- backend: AppProperties predictionBaseUrl + AuthFilter 인증 예외

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
102 lines
3.5 KiB
Python
102 lines
3.5 KiB
Python
from collections import Counter
|
|
from typing import Dict, Optional
|
|
|
|
import numpy as np
|
|
import pandas as pd
|
|
|
|
from pipeline.constants import BIRCH_THRESHOLD, BIRCH_BRANCHING, MIN_CLUSTER_SIZE
|
|
|
|
|
|
class EnhancedBIRCHClusterer:
    """Trajectory clustering using sklearn Birch with a simple K-means fallback.

    Based on the enhanced-BIRCH approach (Yan, Yang et al.):
      1. Resample each trajectory to a fixed-length vector.
      2. Build a BIRCH CF-tree for memory-efficient hierarchical clustering.
      3. Small clusters (< MIN_CLUSTER_SIZE) are relabelled as noise (-1).
    """

    # Default number of samples taken from each trajectory. The same value is
    # the minimum trajectory length accepted by ``fit_predict``.
    DEFAULT_N_POINTS = 20

    def __init__(
        self,
        threshold: float = BIRCH_THRESHOLD,
        branching: int = BIRCH_BRANCHING,
        n_clusters: Optional[int] = None,
        n_points: int = DEFAULT_N_POINTS,
    ) -> None:
        """Store the clustering configuration.

        Args:
            threshold: forwarded to ``Birch(threshold=...)``.
            branching: forwarded to ``Birch(branching_factor=...)``.
            n_clusters: forwarded to ``Birch(n_clusters=...)``.
            n_points: number of samples taken per trajectory; trajectories
                with fewer rows than this are skipped by ``fit_predict``.

        Raises:
            ValueError: if ``n_points`` is smaller than 1.
        """
        if n_points < 1:
            raise ValueError("n_points must be a positive integer")
        self.threshold = threshold
        self.branching = branching
        self.n_clusters = n_clusters
        self.n_points = n_points
        # Last fitted Birch model; stays None until a Birch fit succeeds.
        self._model = None

    def _traj_to_vector(self, df_vessel: pd.DataFrame, n_points: int = DEFAULT_N_POINTS) -> np.ndarray:
        """Convert a vessel trajectory DataFrame to a fixed-length vector.

        Picks ``n_points`` rows at evenly spaced row indices, concatenates the
        sampled latitudes followed by the sampled longitudes, and standardises
        the combined vector to zero mean / unit variance.

        Args:
            df_vessel: trajectory with ``'lat'`` and ``'lon'`` columns.
            n_points: number of rows to sample.

        Returns:
            1-D float array of length ``2 * n_points``. It contains NaN when
            any sampled coordinate is missing.

        Raises:
            IndexError: if ``df_vessel`` has no rows.
        """
        # Coerce to float so None / Decimal columns do not produce an
        # object-dtype array that breaks the arithmetic below.
        lats = df_vessel['lat'].to_numpy(dtype=float)
        lons = df_vessel['lon'].to_numpy(dtype=float)
        # Truncating the evenly spaced positions yields valid row indices.
        idx = np.linspace(0, len(lats) - 1, n_points).astype(int)
        vec = np.concatenate([lats[idx], lons[idx]])
        # The epsilon avoids division by zero when every sampled value is equal.
        return (vec - vec.mean()) / (vec.std() + 1e-9)

    def fit_predict(self, vessels: Dict[str, pd.DataFrame]) -> Dict[str, int]:
        """Cluster vessel trajectories.

        Args:
            vessels: mapping of mmsi -> resampled trajectory DataFrame.

        Returns:
            Mapping of mmsi -> cluster_id. Vessels in small clusters are
            assigned cluster_id -1 (noise). Vessels with fewer than
            ``self.n_points`` points, or with non-finite coordinates among
            the sampled points, are excluded from the result. When fewer
            than three vessels remain, no clustering is run and all of them
            get cluster_id 0.
        """
        mmsi_list: list[str] = []
        vectors: list[np.ndarray] = []

        for mmsi, df_v in vessels.items():
            if len(df_v) < self.n_points:
                continue
            vec = self._traj_to_vector(df_v, self.n_points)
            # Birch rejects NaN/inf input, so a single bad vessel would
            # otherwise abort clustering for every vessel.
            if not np.isfinite(vec).all():
                continue
            mmsi_list.append(mmsi)
            vectors.append(vec)

        if len(vectors) < 3:
            return {m: 0 for m in mmsi_list}

        X = np.array(vectors)

        # Only the import may legitimately raise ImportError; keep the fit
        # itself outside the guarded region.
        try:
            from sklearn.cluster import Birch
        except ImportError:
            # Do not leave a model from an earlier successful fit behind.
            self._model = None
            labels = self._simple_cluster(X)
        else:
            model = Birch(
                threshold=self.threshold,
                branching_factor=self.branching,
                n_clusters=self.n_clusters,
            )
            labels = model.fit_predict(X)
            self._model = model

        # Work on plain Python ints so the returned mapping holds ints.
        label_list = np.asarray(labels).tolist()
        sizes = Counter(label_list)
        return {
            mmsi: (lbl if sizes[lbl] >= MIN_CLUSTER_SIZE else -1)
            for mmsi, lbl in zip(mmsi_list, label_list)
        }

    @staticmethod
    def _simple_cluster(X: np.ndarray, k: int = 5) -> np.ndarray:
        """Fallback K-means used when sklearn is unavailable.

        Args:
            X: 2-D array with one row per sample.
            k: requested number of clusters; capped at the number of rows.

        Returns:
            Integer array with one cluster index per row of ``X`` (empty when
            ``X`` has no rows).

        Initial centres are drawn with the global ``np.random`` state, so the
        labelling can differ between runs unless the caller seeds it.
        """
        n = len(X)
        if n == 0:
            return np.zeros(0, dtype=int)
        k = min(k, n)
        centers = X[np.random.choice(n, k, replace=False)]
        labels = np.zeros(n, dtype=int)
        for _ in range(20):
            # One vectorised norm per centre instead of an n x k Python loop.
            dists = np.stack([np.linalg.norm(X - c, axis=1) for c in centers], axis=1)
            labels = dists.argmin(axis=1)
            # A cluster that lost all its members keeps its previous centre.
            new_centers = np.array(
                [X[labels == i].mean(axis=0) if (labels == i).any() else centers[i] for i in range(k)]
            )
            if np.allclose(centers, new_centers, atol=1e-6):
                break
            centers = new_centers
        return labels
|