fix: time_bucket 수집 안전 윈도우 도입 — incremental fetch 데이터 누락 방지
snpdb 5분 버킷 데이터가 적재 완료까지 ~12분 소요되는데, 기존 fetch_incremental이 상한 없이 미완성 버킷을 수집하여 _last_bucket이 조기 전진 → 뒤늦게 완성된 행 영구 누락. - time_bucket.py 신규: safe_bucket(12분 지연) + backfill(3 bucket) - snpdb.py: fetch_all_tracks/fetch_incremental에 safe 상한 + 백필 하한 - vessel_store.py: merge_incremental sort+keep='last', evict_stale time_bucket 우선 - config.py: SNPDB_SAFE_DELAY_MIN=12, SNPDB_BACKFILL_BUCKETS=3 Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
부모
67523b475d
커밋
2534c9dbca
@ -7,6 +7,7 @@
|
|||||||
### 수정
|
### 수정
|
||||||
- 1h 활성 판정을 parent_name 전체 합산 기준으로 변경 (서브클러스터 분리 후 개별 소수 문제 해결)
|
- 1h 활성 판정을 parent_name 전체 합산 기준으로 변경 (서브클러스터 분리 후 개별 소수 문제 해결)
|
||||||
- vessel_store의 _last_bucket 타임존 오류 수정 (tz-naive KST → UTC 잘못 변환 → incremental fetch 0건)
|
- vessel_store의 _last_bucket 타임존 오류 수정 (tz-naive KST → UTC 잘못 변환 → incremental fetch 0건)
|
||||||
|
- time_bucket 수집 안전 윈도우 도입 — safe_bucket(12분 지연) + 3 bucket 백필로 데이터 누락 방지
|
||||||
|
|
||||||
## [2026-04-01.2]
|
## [2026-04-01.2]
|
||||||
|
|
||||||
|
|||||||
25
prediction/cache/vessel_store.py
vendored
25
prediction/cache/vessel_store.py
vendored
@ -164,10 +164,11 @@ class VesselStore:
|
|||||||
mmsi_str = str(mmsi)
|
mmsi_str = str(mmsi)
|
||||||
if mmsi_str in self._tracks:
|
if mmsi_str in self._tracks:
|
||||||
combined = pd.concat([self._tracks[mmsi_str], group], ignore_index=True)
|
combined = pd.concat([self._tracks[mmsi_str], group], ignore_index=True)
|
||||||
combined = combined.drop_duplicates(subset=['timestamp'])
|
combined = combined.sort_values(['timestamp', 'time_bucket'])
|
||||||
|
combined = combined.drop_duplicates(subset=['timestamp'], keep='last')
|
||||||
self._tracks[mmsi_str] = combined.reset_index(drop=True)
|
self._tracks[mmsi_str] = combined.reset_index(drop=True)
|
||||||
else:
|
else:
|
||||||
self._tracks[mmsi_str] = group.reset_index(drop=True)
|
self._tracks[mmsi_str] = group.sort_values(['timestamp', 'time_bucket']).reset_index(drop=True)
|
||||||
|
|
||||||
if 'time_bucket' in group.columns and not group['time_bucket'].empty:
|
if 'time_bucket' in group.columns and not group['time_bucket'].empty:
|
||||||
bucket_vals = pd.to_datetime(group['time_bucket'].dropna())
|
bucket_vals = pd.to_datetime(group['time_bucket'].dropna())
|
||||||
@ -191,6 +192,10 @@ class VesselStore:
|
|||||||
"""Remove track points older than N hours and evict empty MMSI entries."""
|
"""Remove track points older than N hours and evict empty MMSI entries."""
|
||||||
import datetime as _dt
|
import datetime as _dt
|
||||||
|
|
||||||
|
from time_bucket import compute_initial_window_start, compute_safe_bucket
|
||||||
|
|
||||||
|
safe_bucket = compute_safe_bucket()
|
||||||
|
cutoff_bucket = compute_initial_window_start(hours, safe_bucket)
|
||||||
now = datetime.now(timezone.utc)
|
now = datetime.now(timezone.utc)
|
||||||
cutoff_aware = now - _dt.timedelta(hours=hours)
|
cutoff_aware = now - _dt.timedelta(hours=hours)
|
||||||
cutoff_naive = cutoff_aware.replace(tzinfo=None)
|
cutoff_naive = cutoff_aware.replace(tzinfo=None)
|
||||||
@ -200,12 +205,15 @@ class VesselStore:
|
|||||||
|
|
||||||
for mmsi in list(self._tracks.keys()):
|
for mmsi in list(self._tracks.keys()):
|
||||||
df = self._tracks[mmsi]
|
df = self._tracks[mmsi]
|
||||||
ts_col = df['timestamp']
|
if 'time_bucket' in df.columns and not df['time_bucket'].dropna().empty:
|
||||||
# Handle tz-aware and tz-naive timestamps uniformly
|
bucket_col = pd.to_datetime(df['time_bucket'], errors='coerce')
|
||||||
if hasattr(ts_col.dtype, 'tz') and ts_col.dtype.tz is not None:
|
mask = bucket_col >= pd.Timestamp(cutoff_bucket)
|
||||||
mask = ts_col >= pd.Timestamp(cutoff_aware)
|
|
||||||
else:
|
else:
|
||||||
mask = ts_col >= pd.Timestamp(cutoff_naive)
|
ts_col = df['timestamp']
|
||||||
|
if hasattr(ts_col.dtype, 'tz') and ts_col.dtype.tz is not None:
|
||||||
|
mask = ts_col >= pd.Timestamp(cutoff_aware)
|
||||||
|
else:
|
||||||
|
mask = ts_col >= pd.Timestamp(cutoff_naive)
|
||||||
filtered = df[mask].reset_index(drop=True)
|
filtered = df[mask].reset_index(drop=True)
|
||||||
if filtered.empty:
|
if filtered.empty:
|
||||||
del self._tracks[mmsi]
|
del self._tracks[mmsi]
|
||||||
@ -215,10 +223,11 @@ class VesselStore:
|
|||||||
|
|
||||||
after_total = sum(len(v) for v in self._tracks.values())
|
after_total = sum(len(v) for v in self._tracks.values())
|
||||||
logger.info(
|
logger.info(
|
||||||
'eviction complete: removed %d points, evicted %d mmsis (threshold=%dh)',
|
'eviction complete: removed %d points, evicted %d mmsis (threshold=%dh, cutoff_bucket=%s)',
|
||||||
before_total - after_total,
|
before_total - after_total,
|
||||||
len(evicted_mmsis),
|
len(evicted_mmsis),
|
||||||
hours,
|
hours,
|
||||||
|
cutoff_bucket,
|
||||||
)
|
)
|
||||||
|
|
||||||
def refresh_static_info(self) -> None:
|
def refresh_static_info(self) -> None:
|
||||||
|
|||||||
@ -25,6 +25,8 @@ class Settings(BaseSettings):
|
|||||||
INITIAL_LOAD_HOURS: int = 24
|
INITIAL_LOAD_HOURS: int = 24
|
||||||
STATIC_INFO_REFRESH_MIN: int = 60
|
STATIC_INFO_REFRESH_MIN: int = 60
|
||||||
PERMIT_REFRESH_MIN: int = 30
|
PERMIT_REFRESH_MIN: int = 30
|
||||||
|
SNPDB_SAFE_DELAY_MIN: int = 12
|
||||||
|
SNPDB_BACKFILL_BUCKETS: int = 3
|
||||||
|
|
||||||
# 파이프라인
|
# 파이프라인
|
||||||
TRAJECTORY_HOURS: int = 6
|
TRAJECTORY_HOURS: int = 6
|
||||||
|
|||||||
@ -8,6 +8,7 @@ import psycopg2
|
|||||||
from psycopg2 import pool
|
from psycopg2 import pool
|
||||||
|
|
||||||
from config import settings
|
from config import settings
|
||||||
|
from time_bucket import compute_incremental_window_start, compute_initial_window_start, compute_safe_bucket
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
@ -61,8 +62,12 @@ def fetch_all_tracks(hours: int = 24) -> pd.DataFrame:
|
|||||||
|
|
||||||
LineStringM 지오메트리에서 개별 포인트를 추출하며,
|
LineStringM 지오메트리에서 개별 포인트를 추출하며,
|
||||||
한국 해역(122-132E, 31-39N) 내 최근 N시간 데이터를 반환한다.
|
한국 해역(122-132E, 31-39N) 내 최근 N시간 데이터를 반환한다.
|
||||||
|
safe_bucket(현재 - 12분)까지만 조회하여 미완성 버킷을 제외한다.
|
||||||
"""
|
"""
|
||||||
query = f"""
|
safe_bucket = compute_safe_bucket()
|
||||||
|
window_start = compute_initial_window_start(hours, safe_bucket)
|
||||||
|
|
||||||
|
query = """
|
||||||
SELECT
|
SELECT
|
||||||
t.mmsi,
|
t.mmsi,
|
||||||
to_timestamp(ST_M((dp).geom)) as timestamp,
|
to_timestamp(ST_M((dp).geom)) as timestamp,
|
||||||
@ -75,18 +80,21 @@ def fetch_all_tracks(hours: int = 24) -> pd.DataFrame:
|
|||||||
END as raw_sog
|
END as raw_sog
|
||||||
FROM signal.t_vessel_tracks_5min t,
|
FROM signal.t_vessel_tracks_5min t,
|
||||||
LATERAL ST_DumpPoints(t.track_geom) dp
|
LATERAL ST_DumpPoints(t.track_geom) dp
|
||||||
WHERE t.time_bucket >= NOW() - INTERVAL '{hours} hours'
|
WHERE t.time_bucket >= %s
|
||||||
|
AND t.time_bucket <= %s
|
||||||
AND t.track_geom && ST_MakeEnvelope(122, 31, 132, 39, 4326)
|
AND t.track_geom && ST_MakeEnvelope(122, 31, 132, 39, 4326)
|
||||||
ORDER BY t.mmsi, to_timestamp(ST_M((dp).geom))
|
ORDER BY t.mmsi, to_timestamp(ST_M((dp).geom))
|
||||||
"""
|
"""
|
||||||
|
|
||||||
try:
|
try:
|
||||||
with get_conn() as conn:
|
with get_conn() as conn:
|
||||||
df = pd.read_sql_query(query, conn)
|
df = pd.read_sql_query(query, conn, params=(window_start, safe_bucket))
|
||||||
logger.info(
|
logger.info(
|
||||||
'fetch_all_tracks: %d rows, %d vessels (last %dh)',
|
'fetch_all_tracks: %d rows, %d vessels (window=%s..%s, last %dh safe)',
|
||||||
len(df),
|
len(df),
|
||||||
df['mmsi'].nunique() if len(df) > 0 else 0,
|
df['mmsi'].nunique() if len(df) > 0 else 0,
|
||||||
|
window_start,
|
||||||
|
safe_bucket,
|
||||||
hours,
|
hours,
|
||||||
)
|
)
|
||||||
return df
|
return df
|
||||||
@ -98,9 +106,20 @@ def fetch_all_tracks(hours: int = 24) -> pd.DataFrame:
|
|||||||
def fetch_incremental(last_bucket: datetime) -> pd.DataFrame:
|
def fetch_incremental(last_bucket: datetime) -> pd.DataFrame:
|
||||||
"""last_bucket 이후의 신규 궤적 포인트를 조회한다.
|
"""last_bucket 이후의 신규 궤적 포인트를 조회한다.
|
||||||
|
|
||||||
스케줄러 증분 업데이트에 사용되며, time_bucket > last_bucket 조건으로
|
safe_bucket(현재 - 12분)까지만 조회하여 미완성 버킷을 제외하고,
|
||||||
이미 처리한 버킷을 건너뛴다.
|
from_bucket(last_bucket - 15분)부터 재수집하여 지연 INSERT를 보상한다.
|
||||||
"""
|
"""
|
||||||
|
safe_bucket = compute_safe_bucket()
|
||||||
|
from_bucket = compute_incremental_window_start(last_bucket)
|
||||||
|
if safe_bucket <= from_bucket:
|
||||||
|
logger.info(
|
||||||
|
'fetch_incremental skipped: safe_bucket=%s, from_bucket=%s, last_bucket=%s',
|
||||||
|
safe_bucket,
|
||||||
|
from_bucket,
|
||||||
|
last_bucket,
|
||||||
|
)
|
||||||
|
return pd.DataFrame(columns=['mmsi', 'timestamp', 'lat', 'lon', 'raw_sog'])
|
||||||
|
|
||||||
query = """
|
query = """
|
||||||
SELECT
|
SELECT
|
||||||
t.mmsi,
|
t.mmsi,
|
||||||
@ -115,17 +134,20 @@ def fetch_incremental(last_bucket: datetime) -> pd.DataFrame:
|
|||||||
FROM signal.t_vessel_tracks_5min t,
|
FROM signal.t_vessel_tracks_5min t,
|
||||||
LATERAL ST_DumpPoints(t.track_geom) dp
|
LATERAL ST_DumpPoints(t.track_geom) dp
|
||||||
WHERE t.time_bucket > %s
|
WHERE t.time_bucket > %s
|
||||||
|
AND t.time_bucket <= %s
|
||||||
AND t.track_geom && ST_MakeEnvelope(122, 31, 132, 39, 4326)
|
AND t.track_geom && ST_MakeEnvelope(122, 31, 132, 39, 4326)
|
||||||
ORDER BY t.mmsi, to_timestamp(ST_M((dp).geom))
|
ORDER BY t.mmsi, to_timestamp(ST_M((dp).geom))
|
||||||
"""
|
"""
|
||||||
|
|
||||||
try:
|
try:
|
||||||
with get_conn() as conn:
|
with get_conn() as conn:
|
||||||
df = pd.read_sql_query(query, conn, params=(last_bucket,))
|
df = pd.read_sql_query(query, conn, params=(from_bucket, safe_bucket))
|
||||||
logger.info(
|
logger.info(
|
||||||
'fetch_incremental: %d rows, %d vessels (since %s)',
|
'fetch_incremental: %d rows, %d vessels (from %s, safe %s, last %s)',
|
||||||
len(df),
|
len(df),
|
||||||
df['mmsi'].nunique() if len(df) > 0 else 0,
|
df['mmsi'].nunique() if len(df) > 0 else 0,
|
||||||
|
from_bucket.isoformat(),
|
||||||
|
safe_bucket.isoformat(),
|
||||||
last_bucket.isoformat(),
|
last_bucket.isoformat(),
|
||||||
)
|
)
|
||||||
return df
|
return df
|
||||||
|
|||||||
42
prediction/time_bucket.py
Normal file
42
prediction/time_bucket.py
Normal file
@ -0,0 +1,42 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from datetime import datetime, timedelta, timezone
|
||||||
|
from zoneinfo import ZoneInfo
|
||||||
|
|
||||||
|
from config import settings
|
||||||
|
|
||||||
|
# All bucket arithmetic in this module is done in Korea Standard Time and
# handed around tz-naive (see normalize_bucket_kst, which strips tzinfo
# after converting to KST) — presumably to match snpdb's time_bucket
# column; TODO confirm against the DB schema.
_KST = ZoneInfo('Asia/Seoul')
# Width of one aggregation bucket in minutes (signal.t_vessel_tracks_5min).
_BUCKET_MINUTES = 5
|
|
||||||
|
|
||||||
|
def normalize_bucket_kst(bucket: datetime) -> datetime:
    """Return *bucket* as a tz-naive KST datetime.

    A tz-aware input is converted to KST and stripped of its tzinfo; a
    tz-naive input is assumed to already be KST and passed through as-is.
    """
    if bucket.tzinfo is not None:
        bucket = bucket.astimezone(_KST).replace(tzinfo=None)
    return bucket
|
|
||||||
|
|
||||||
|
def floor_bucket_kst(value: datetime, bucket_minutes: int = _BUCKET_MINUTES) -> datetime:
    """Floor *value* to the start of its bucket on the KST clock.

    A tz-naive *value* is interpreted as KST; an aware one is converted
    first. The result keeps the KST tzinfo attached (callers that need a
    naive bucket strip it themselves).
    """
    if value.tzinfo is not None:
        kst_value = value.astimezone(_KST)
    else:
        kst_value = value.replace(tzinfo=_KST)
    # minute - (minute % width) snaps down to the bucket grid.
    snapped = kst_value.minute - kst_value.minute % bucket_minutes
    return kst_value.replace(minute=snapped, second=0, microsecond=0)
|
|
||||||
|
|
||||||
|
def compute_safe_bucket(now: datetime | None = None) -> datetime:
    """Return the latest time_bucket considered safe to fetch, tz-naive KST.

    The safe point is *now* minus ``settings.SNPDB_SAFE_DELAY_MIN`` minutes,
    floored to the bucket grid — buckets newer than this may still be
    loading and are excluded from fetches.

    Args:
        now: Reference instant; defaults to the current UTC time.
            A tz-naive value is interpreted as UTC.

    Returns:
        The floored safe bucket with tzinfo stripped (naive KST).
    """
    # Explicit None check instead of `now or ...`: truthiness is the wrong
    # test for an optional datetime argument.
    current = datetime.now(timezone.utc) if now is None else now
    if current.tzinfo is None:
        current = current.replace(tzinfo=timezone.utc)
    safe_point = current.astimezone(_KST) - timedelta(minutes=settings.SNPDB_SAFE_DELAY_MIN)
    # Drop tzinfo so the result compares cleanly with naive bucket values.
    return floor_bucket_kst(safe_point).replace(tzinfo=None)
|
|
||||||
|
|
||||||
|
def compute_initial_window_start(hours: int, safe_bucket: datetime | None = None) -> datetime:
    """Return the lower bound of the initial load window, tz-naive KST.

    The window ends at *safe_bucket* (default: ``compute_safe_bucket()``)
    and reaches *hours* hours back from it, so the initial fetch and the
    eviction cutoff share the same safe upper anchor.

    Args:
        hours: Window length in hours.
        safe_bucket: Optional upper anchor; normalized to naive KST.
    """
    # Explicit None check instead of `safe_bucket or ...`: truthiness is
    # the wrong test for an optional datetime argument.
    anchor = compute_safe_bucket() if safe_bucket is None else safe_bucket
    return normalize_bucket_kst(anchor) - timedelta(hours=hours)
|
|
||||||
|
|
||||||
|
def compute_incremental_window_start(last_bucket: datetime) -> datetime:
    """Return the re-fetch lower bound for an incremental update.

    Steps ``settings.SNPDB_BACKFILL_BUCKETS`` buckets behind the last
    processed bucket so that rows inserted late into those buckets are
    picked up again.
    """
    backfill_span = timedelta(minutes=_BUCKET_MINUTES * settings.SNPDB_BACKFILL_BUCKETS)
    return normalize_bucket_kst(last_bucket) - backfill_span
불러오는 중...
Reference in New Issue
Block a user