- prediction/: FastAPI 7단계 분류 파이프라인 + 6개 탐지 알고리즘 - snpdb 궤적 조회 → 인메모리 캐시(13K척) → 분류 → kcgdb 저장 - APScheduler 5분 주기, Python 3.9 호환 - 버그 수정: @property last_bucket, SQL INTERVAL 바인딩, rollback, None 가드 - 보안: DB 비밀번호 하드코딩 제거 → env 환경변수 필수 - deploy/kcg-prediction.service: systemd 서비스 (redis-211, 포트 8001) - deploy.yml: prediction CI/CD 배포 단계 추가 (192.168.1.18:32023) - backend: PredictionProxyController (health/status/trigger 프록시) - backend: AppProperties predictionBaseUrl + AuthFilter 인증 예외 Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
53 lines
1.8 KiB
Python
53 lines
1.8 KiB
Python
import pandas as pd
|
|
from collections import defaultdict
|
|
|
|
from pipeline.constants import KR_BOUNDS, MAX_SOG_KNOTS, MIN_TRAJ_POINTS
|
|
|
|
|
|
class AISPreprocessor:
    """Clean raw AIS records: Delete-Supplement-Update (Yan et al. 2022).

    The pipeline applied by :meth:`run` is:

    1. Delete rows whose MMSI is not exactly nine digits.
    2. Delete rows whose position is outside the valid lat/lon range or
       outside the configured bounding box.
    3. Sort by vessel and time and delete duplicate ``(mmsi, timestamp)``
       rows.
    4. Supplement out-of-range or missing speed-over-ground (SOG) values
       with the mean of the valid neighbouring samples of the same vessel;
       rows that cannot be repaired are deleted.
    5. Delete vessels left with fewer than the minimum number of points.

    Attributes:
        stats: Counters describing the work done. ``invalid_mmsi``
            accumulates across calls to :meth:`run`; ``final_records`` and
            ``retention_pct`` describe the most recent call only.
        bounds: Mapping with ``lat_min``, ``lat_max``, ``lon_min`` and
            ``lon_max`` keys.
        max_sog_knots: Largest SOG value accepted as valid.
        min_traj_points: Minimum number of rows a vessel must keep.
    """

    def __init__(self, *, bounds=None, max_sog_knots=None, min_traj_points=None):
        """Create a preprocessor.

        Args:
            bounds: Optional bounding box mapping; defaults to ``KR_BOUNDS``.
            max_sog_knots: Optional SOG upper limit; defaults to
                ``MAX_SOG_KNOTS``.
            min_traj_points: Optional minimum trajectory length; defaults to
                ``MIN_TRAJ_POINTS``.
        """
        self.stats = defaultdict(int)
        self.bounds = KR_BOUNDS if bounds is None else bounds
        self.max_sog_knots = (
            MAX_SOG_KNOTS if max_sog_knots is None else max_sog_knots
        )
        self.min_traj_points = (
            MIN_TRAJ_POINTS if min_traj_points is None else min_traj_points
        )

    def run(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return a cleaned copy of ``df``; the input frame is not modified.

        Args:
            df: Frame with at least the columns ``mmsi``, ``timestamp``,
                ``lat``, ``lon``, ``sog`` and ``cog``.

        Returns:
            The surviving rows sorted by ``mmsi`` then ``timestamp``, with a
            fresh ``RangeIndex``.

        Raises:
            ValueError: If any required column is missing.
        """
        original = len(df)

        required = ['mmsi', 'timestamp', 'lat', 'lon', 'sog', 'cog']
        missing = [c for c in required if c not in df.columns]
        if missing:
            raise ValueError(f"필수 컬럼 누락: {missing}")

        df = df.copy()
        df['timestamp'] = pd.to_datetime(df['timestamp'])

        # Delete: an MMSI must be exactly nine digits.
        valid_mmsi = df['mmsi'].astype(str).str.match(r'^\d{9}$')
        df = df[valid_mmsi]
        self.stats['invalid_mmsi'] += original - len(df)

        # Delete: physically impossible coordinates, then anything outside
        # the configured area of interest.
        df = df[df['lat'].between(-90, 90) & df['lon'].between(-180, 180)]
        bounds = self.bounds
        df = df[
            df['lat'].between(bounds['lat_min'], bounds['lat_max'])
            & df['lon'].between(bounds['lon_min'], bounds['lon_max'])
        ]

        df = df.sort_values(['mmsi', 'timestamp'])
        # De-duplicate BEFORE smoothing and counting: duplicates must neither
        # weigh on the rolling mean nor count toward the minimum length.
        df = df.drop_duplicates(subset=['mmsi', 'timestamp'])

        df = self._supplement_sog(df)

        # Delete: vessels whose remaining trajectory is too short.
        counts = df.groupby('mmsi').size()
        long_enough = counts[counts >= self.min_traj_points].index
        df = df[df['mmsi'].isin(long_enough)]

        self.stats['final_records'] = len(df)
        self.stats['retention_pct'] = round(len(df) / max(original, 1) * 100, 2)
        return df.reset_index(drop=True)

    def _supplement_sog(self, df):
        """Repair invalid SOG values from valid neighbours of the same vessel.

        Values outside ``[0, max_sog_knots]`` (and missing values) are masked
        first so they cannot contaminate their own replacement, then filled
        with the centred 3-sample mean of the remaining valid values. Rows
        with no valid neighbour stay NaN and are dropped.

        Expects ``df`` to be sorted by ``mmsi`` and ``timestamp``.
        """
        in_range = df['sog'].between(0, self.max_sog_knots)
        trusted = df['sog'].where(in_range)
        if not in_range.all():
            neighbour_mean = trusted.groupby(df['mmsi']).transform(
                lambda s: s.rolling(3, center=True, min_periods=1).mean()
            )
            trusted = trusted.fillna(neighbour_mean)
        df = df.assign(sog=trusted)
        return df[df['sog'].notna()]
|