chore: Rename Kafka topic and add SignalKind collection script

- Rename tp_SNP_AIS_Signal → tp_Global_AIS_Signal (3 profiles)
- Add scripts/collect_signalkind_candidates.sh
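
Example runs (taken from the script's usage help):
  scripts/collect_signalkind_candidates.sh -p local -d 600
  scripts/collect_signalkind_candidates.sh -p local -r earliest -m 50000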

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
htlee 2026-02-14 21:52:50 +09:00
Parent 178ac506bf
Commit 290933f94f
4 changed files with 434 additions and 3 deletions

scripts/collect_signalkind_candidates.sh

@@ -0,0 +1,431 @@
#!/usr/bin/env bash
set -euo pipefail
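# Consume the AIS signal Kafka topic for a bounded window and accumulate
# vesselType/extraInfo statistics under docs/signalkind, as input for drafting
# the SignalKind mapping. Re-runs accumulate into aggregate_store.json.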
ROOT_DIR="$(cd "$(dirname "$0")/.." && pwd)"
PROFILE="local"
DURATION_SEC=120
MAX_MESSAGES=200000
GROUP_ID="signalkind-collector-v1"
OFFSET_RESET="latest"
OUTPUT_DIR="$ROOT_DIR/docs/signalkind"
usage() {
cat <<'USAGE'
Usage: collect_signalkind_candidates.sh [options]
Options:
  -p, --profile <name>        Spring profile (default: local)
  -d, --duration-sec <sec>    Consumption duration in seconds (default: 120)
  -m, --max-messages <count>  Max messages per run (default: 200000)
  -g, --group-id <id>         Kafka consumer group id (default: signalkind-collector-v1)
  -r, --offset-reset <value>  auto.offset.reset: earliest|latest (default: latest)
  -o, --output-dir <dir>      Output directory (default: docs/signalkind)
  -h, --help                  Show this help
Examples:
  scripts/collect_signalkind_candidates.sh -p local -d 600
  scripts/collect_signalkind_candidates.sh -p local -r earliest -m 50000
USAGE
}
while [[ $# -gt 0 ]]; do
case "$1" in
-p|--profile)
PROFILE="$2"
shift 2
;;
-d|--duration-sec)
DURATION_SEC="$2"
shift 2
;;
-m|--max-messages)
MAX_MESSAGES="$2"
shift 2
;;
-g|--group-id)
GROUP_ID="$2"
shift 2
;;
-r|--offset-reset)
OFFSET_RESET="$2"
shift 2
;;
-o|--output-dir)
OUTPUT_DIR="$2"
shift 2
;;
-h|--help)
usage
exit 0
;;
*)
echo "Unknown option: $1"
usage
exit 1
;;
esac
done
if [[ "$OFFSET_RESET" != "earliest" && "$OFFSET_RESET" != "latest" ]]; then
echo "Invalid --offset-reset value: $OFFSET_RESET"
exit 1
fi
CONFIG_FILE="$ROOT_DIR/src/main/resources/application-${PROFILE}.yml"
if [[ ! -f "$CONFIG_FILE" ]]; then
echo "Profile config not found: $CONFIG_FILE"
exit 1
fi
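# The awk calls below take the first "bootstrap-servers:" and the first "topic:"
# value found in the profile YAML. A minimal sketch of the shape this assumes
# (the host is hypothetical; the topic key matches the diff hunks below):
#   spring:
#     kafka:
#       bootstrap-servers: localhost:9092
#   app:
#     kafka:
#       topic: tp_Global_AIS_Signal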
BOOTSTRAP_SERVERS="$(awk '/^[[:space:]]*bootstrap-servers:/{print $2; exit}' "$CONFIG_FILE")"
TOPIC_NAME="$(awk '/^[[:space:]]*topic:/{print $2; exit}' "$CONFIG_FILE")"
if [[ -z "${BOOTSTRAP_SERVERS:-}" || -z "${TOPIC_NAME:-}" ]]; then
echo "Failed to read bootstrap/topic from $CONFIG_FILE"
exit 1
fi
mkdir -p "$OUTPUT_DIR"
CP_FILE="/tmp/snp-signalkind-cp.txt"
mvn -q -DskipTests dependency:build-classpath -Dmdep.outputFile="$CP_FILE"
CLASSPATH="$(cat "$CP_FILE")"
JAVA_FILE="/tmp/SignalkindCollector.java"
cat >"$JAVA_FILE" <<'JAVA'
import com.fasterxml.jackson.annotation.JsonIgnoreProperties;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import org.apache.kafka.clients.consumer.ConsumerConfig;
import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.clients.consumer.ConsumerRecords;
import org.apache.kafka.clients.consumer.KafkaConsumer;
import java.io.BufferedWriter;
import java.io.FileWriter;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.time.Instant;
import java.time.OffsetDateTime;
import java.time.ZoneOffset;
import java.util.*;
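// One-shot collector: consumes the AIS signal topic for a bounded window, folds
// vesselType/extraInfo counts into aggregate_store.json, and regenerates the
// TSV reports from the accumulated totals on every run.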
public class SignalkindCollector {
@JsonIgnoreProperties(ignoreUnknown = true)
public static class Aggregate {
public long totalConsumed = 0;
public long totalParseError = 0;
public long totalMissingVesselType = 0;
public Map<String, KindStat> vesselTypeStats = new HashMap<>();
public Map<String, KindStat> vesselTypeExtraStats = new HashMap<>();
}
@JsonIgnoreProperties(ignoreUnknown = true)
public static class KindStat {
public long count = 0;
public long lastTimestampMs = 0;
public LinkedHashSet<String> sampleMmsi = new LinkedHashSet<>();
}
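// Human-curated mapping decisions; re-read from the existing draft TSV so that
// proposedCode/status/notes survive regeneration of the draft file.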
public static class DecisionFields {
String proposedCode = "";
String status = "PENDING";
String notes = "";
}
public static void main(String[] args) throws Exception {
if (args.length < 7) {
throw new IllegalArgumentException(
"Usage: SignalkindCollector <bootstrap> <topic> <outputDir> <durationSec> <maxMessages> <groupId> <offsetReset>");
}
String bootstrap = args[0];
String topic = args[1];
Path outputDir = Path.of(args[2]);
int durationSec = Integer.parseInt(args[3]);
int maxMessages = Integer.parseInt(args[4]);
String groupId = args[5];
String offsetReset = args[6];
Files.createDirectories(outputDir);
ObjectMapper mapper = new ObjectMapper();
Path aggregateJsonPath = outputDir.resolve("aggregate_store.json");
Path vesselTypeTsvPath = outputDir.resolve("vesseltype_stats.tsv");
Path vesselTypeExtraTsvPath = outputDir.resolve("vesseltype_extra_stats.tsv");
Path mappingDraftTsvPath = outputDir.resolve("signalkind_mapping_draft.tsv");
Path runSummaryPath = outputDir.resolve("last_run_summary.txt");
Aggregate aggregate = loadAggregate(mapper, aggregateJsonPath);
Map<String, DecisionFields> existingDraft = loadExistingDraft(mappingDraftTsvPath);
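// String-deserializing consumer with auto-commit off; offsets are committed
// manually after each polled batch has been folded into the aggregate.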
Properties props = new Properties();
props.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, bootstrap);
props.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringDeserializer");
props.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringDeserializer");
props.put(ConsumerConfig.GROUP_ID_CONFIG, groupId);
props.put(ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG, "false");
props.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, offsetReset);
props.put(ConsumerConfig.MAX_POLL_RECORDS_CONFIG, "2000");
props.put(ConsumerConfig.REQUEST_TIMEOUT_MS_CONFIG, "30000");
long runStart = System.currentTimeMillis();
long runDeadline = runStart + (durationSec * 1000L);
long runConsumed = 0;
long runParseError = 0;
long runMissingVesselType = 0;
try (KafkaConsumer<String, String> consumer = new KafkaConsumer<>(props)) {
consumer.subscribe(Collections.singleton(topic));
while (System.currentTimeMillis() < runDeadline && runConsumed < maxMessages) {
ConsumerRecords<String, String> records = consumer.poll(java.time.Duration.ofMillis(1000));
if (records.isEmpty()) {
continue;
}
for (ConsumerRecord<String, String> record : records) {
if (runConsumed >= maxMessages) {
break;
}
runConsumed++;
try {
JsonNode root = mapper.readTree(record.value());
JsonNode payload = root.path("payload");
String vesselType = normalize(payload.path("vesselType").asText(null), "N/A");
String extraInfo = normalize(payload.path("extraInfo").asText(null), "N/A");
String mmsi = normalize(payload.path("mmsi").asText(null), "");
if ("N/A".equals(vesselType)) {
runMissingVesselType++;
}
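// The U+001F unit separator joins vesselType and extraInfo into one composite
// key that cannot collide with the tab-delimited values written to the TSVs.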
updateStat(aggregate.vesselTypeStats, vesselType, record.timestamp(), mmsi);
updateStat(aggregate.vesselTypeExtraStats, vesselType + "\u001F" + extraInfo, record.timestamp(), mmsi);
} catch (Exception e) {
runParseError++;
}
}
consumer.commitSync();
}
}
aggregate.totalConsumed += runConsumed;
aggregate.totalParseError += runParseError;
aggregate.totalMissingVesselType += runMissingVesselType;
mapper.writerWithDefaultPrettyPrinter().writeValue(aggregateJsonPath.toFile(), aggregate);
writeVesselTypeStats(aggregate, vesselTypeTsvPath);
writeVesselTypeExtraStats(aggregate, vesselTypeExtraTsvPath);
writeMappingDraft(aggregate, existingDraft, mappingDraftTsvPath);
writeRunSummary(runSummaryPath, bootstrap, topic, groupId, offsetReset,
runStart, System.currentTimeMillis(), runConsumed, runParseError, runMissingVesselType, aggregate.totalConsumed);
System.out.println("OUTPUT_DIR=" + outputDir);
System.out.println("RUN_CONSUMED=" + runConsumed);
System.out.println("RUN_PARSE_ERROR=" + runParseError);
System.out.println("RUN_MISSING_VESSEL_TYPE=" + runMissingVesselType);
System.out.println("TOTAL_CONSUMED=" + aggregate.totalConsumed);
System.out.println("TOTAL_VESSEL_TYPES=" + aggregate.vesselTypeStats.size());
System.out.println("TOTAL_VESSEL_TYPE_EXTRA=" + aggregate.vesselTypeExtraStats.size());
}
private static Aggregate loadAggregate(ObjectMapper mapper, Path aggregateJsonPath) {
try {
if (Files.exists(aggregateJsonPath)) {
return mapper.readValue(aggregateJsonPath.toFile(), Aggregate.class);
}
} catch (Exception ignored) {
}
return new Aggregate();
}
private static Map<String, DecisionFields> loadExistingDraft(Path mappingDraftTsvPath) {
Map<String, DecisionFields> map = new HashMap<>();
if (!Files.exists(mappingDraftTsvPath)) {
return map;
}
try {
List<String> lines = Files.readAllLines(mappingDraftTsvPath, StandardCharsets.UTF_8);
boolean first = true;
for (String line : lines) {
if (first) {
first = false;
continue;
}
String[] arr = line.split("\t", -1);
if (arr.length < 8) {
continue;
}
String key = arr[0] + "\u001F" + arr[1];
DecisionFields fields = new DecisionFields();
fields.proposedCode = arr[5];
fields.status = arr[6].isBlank() ? "PENDING" : arr[6];
fields.notes = arr[7];
map.put(key, fields);
}
} catch (Exception ignored) {
}
return map;
}
private static void updateStat(Map<String, KindStat> map, String key, long ts, String mmsi) {
KindStat stat = map.computeIfAbsent(key, k -> new KindStat());
stat.count += 1;
stat.lastTimestampMs = Math.max(stat.lastTimestampMs, ts);
if (mmsi != null && !mmsi.isBlank() && stat.sampleMmsi.size() < 5) {
stat.sampleMmsi.add(mmsi);
}
}
private static String normalize(String value, String defaultValue) {
if (value == null) {
return defaultValue;
}
String v = value.trim();
if (v.isEmpty() || "null".equalsIgnoreCase(v)) {
return defaultValue;
}
return v;
}
private static void writeVesselTypeStats(Aggregate aggregate, Path path) throws Exception {
List<Map.Entry<String, KindStat>> rows = aggregate.vesselTypeStats.entrySet()
.stream()
.sorted((a, b) -> Long.compare(b.getValue().count, a.getValue().count))
.toList();
try (BufferedWriter w = new BufferedWriter(new FileWriter(path.toFile(), StandardCharsets.UTF_8, false))) {
w.write("vesselType\tcount\tratio\tlastSeenUtc\tsampleMmsi\n");
long total = Math.max(1L, aggregate.totalConsumed);
for (Map.Entry<String, KindStat> row : rows) {
KindStat s = row.getValue();
double ratio = (s.count * 100.0) / total;
w.write(row.getKey() + "\t"
+ s.count + "\t"
+ String.format(Locale.US, "%.4f", ratio) + "\t"
+ formatTs(s.lastTimestampMs) + "\t"
+ String.join(",", s.sampleMmsi) + "\n");
}
}
}
private static void writeVesselTypeExtraStats(Aggregate aggregate, Path path) throws Exception {
List<Map.Entry<String, KindStat>> rows = aggregate.vesselTypeExtraStats.entrySet()
.stream()
.sorted((a, b) -> Long.compare(b.getValue().count, a.getValue().count))
.toList();
try (BufferedWriter w = new BufferedWriter(new FileWriter(path.toFile(), StandardCharsets.UTF_8, false))) {
w.write("vesselType\textraInfo\tcount\tratio\tlastSeenUtc\tsampleMmsi\n");
long total = Math.max(1L, aggregate.totalConsumed);
for (Map.Entry<String, KindStat> row : rows) {
String[] keys = row.getKey().split("\u001F", 2);
String vesselType = keys.length > 0 ? keys[0] : "N/A";
String extraInfo = keys.length > 1 ? keys[1] : "N/A";
KindStat s = row.getValue();
double ratio = (s.count * 100.0) / total;
w.write(vesselType + "\t"
+ extraInfo + "\t"
+ s.count + "\t"
+ String.format(Locale.US, "%.4f", ratio) + "\t"
+ formatTs(s.lastTimestampMs) + "\t"
+ String.join(",", s.sampleMmsi) + "\n");
}
}
}
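// Regenerates the mapping draft, carrying over previously entered decision
// columns (proposedSignalKindCode, decisionStatus, notes) for keys already seen.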
private static void writeMappingDraft(Aggregate aggregate, Map<String, DecisionFields> existing, Path path) throws Exception {
List<Map.Entry<String, KindStat>> rows = aggregate.vesselTypeExtraStats.entrySet()
.stream()
.sorted((a, b) -> Long.compare(b.getValue().count, a.getValue().count))
.toList();
try (BufferedWriter w = new BufferedWriter(new FileWriter(path.toFile(), StandardCharsets.UTF_8, false))) {
w.write("vesselType\textraInfo\tcount\tlastSeenUtc\tsampleMmsi\tproposedSignalKindCode\tdecisionStatus\tnotes\n");
for (Map.Entry<String, KindStat> row : rows) {
String key = row.getKey();
String[] keys = key.split("\u001F", 2);
String vesselType = keys.length > 0 ? keys[0] : "N/A";
String extraInfo = keys.length > 1 ? keys[1] : "N/A";
KindStat s = row.getValue();
DecisionFields d = existing.getOrDefault(key, new DecisionFields());
w.write(vesselType + "\t"
+ extraInfo + "\t"
+ s.count + "\t"
+ formatTs(s.lastTimestampMs) + "\t"
+ String.join(",", s.sampleMmsi) + "\t"
+ d.proposedCode + "\t"
+ d.status + "\t"
+ d.notes + "\n");
}
}
}
private static void writeRunSummary(
Path path,
String bootstrap,
String topic,
String groupId,
String offsetReset,
long startMs,
long endMs,
long runConsumed,
long runParseError,
long runMissingVesselType,
long totalConsumed
) throws Exception {
try (BufferedWriter w = new BufferedWriter(new FileWriter(path.toFile(), StandardCharsets.UTF_8, false))) {
w.write("bootstrap=" + bootstrap + "\n");
w.write("topic=" + topic + "\n");
w.write("groupId=" + groupId + "\n");
w.write("offsetReset=" + offsetReset + "\n");
w.write("runStartUtc=" + Instant.ofEpochMilli(startMs) + "\n");
w.write("runEndUtc=" + Instant.ofEpochMilli(endMs) + "\n");
w.write("runDurationSec=" + ((endMs - startMs) / 1000) + "\n");
w.write("runConsumed=" + runConsumed + "\n");
w.write("runParseError=" + runParseError + "\n");
w.write("runMissingVesselType=" + runMissingVesselType + "\n");
w.write("totalConsumed=" + totalConsumed + "\n");
}
}
private static String formatTs(long ts) {
if (ts <= 0) {
return "";
}
return OffsetDateTime.ofInstant(Instant.ofEpochMilli(ts), ZoneOffset.UTC).toString();
}
}
JAVA
javac -cp "$CLASSPATH" "$JAVA_FILE"
java -cp "$CLASSPATH:/tmp" SignalkindCollector \
"$BOOTSTRAP_SERVERS" \
"$TOPIC_NAME" \
"$OUTPUT_DIR" \
"$DURATION_SEC" \
"$MAX_MESSAGES" \
"$GROUP_ID" \
"$OFFSET_RESET"
echo "[DONE] Output files:"
echo " - $OUTPUT_DIR/aggregate_store.json"
echo " - $OUTPUT_DIR/vesseltype_stats.tsv"
echo " - $OUTPUT_DIR/vesseltype_extra_stats.tsv"
echo " - $OUTPUT_DIR/signalkind_mapping_draft.tsv"
echo " - $OUTPUT_DIR/last_run_summary.txt"

@@ -118,7 +118,7 @@ app:
     cron: "15 * * * * ?" # run at second 15 of every minute
   kafka:
     enabled: true
-    topic: tp_SNP_AIS_Signal
+    topic: tp_Global_AIS_Signal
     send-chunk-size: 5000
     fail-on-send-error: false
 # AIS Target cache settings

@@ -120,7 +120,7 @@ app:
     cron: "15 * * * * ?" # run at second 15 of every minute
   kafka:
     enabled: true
-    topic: tp_SNP_AIS_Signal
+    topic: tp_Global_AIS_Signal
     send-chunk-size: 5000
     fail-on-send-error: false
 # AIS Target cache settings

@@ -170,7 +170,7 @@ app:
     cron: "15 * * * * ?" # run at second 15 of every minute
   kafka:
     enabled: true
-    topic: tp_SNP_AIS_Signal
+    topic: tp_Global_AIS_Signal
     send-chunk-size: 5000
     fail-on-send-error: false