Skip to content

Commit 2237308

Browse files
authored
wordcloudの固有名詞等を学習する (#10)
1 parent 109210c commit 2237308

18 files changed

Lines changed: 1701 additions & 596 deletions

.python-version

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
3.11
1+
3.14.3

Dockerfile

Lines changed: 17 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,22 +1,32 @@
FROM ghcr.io/astral-sh/uv:python3.14-trixie-slim

WORKDIR /app

# 1. Install the full build toolchain.
#    Without gcc and libc6-dev the build fails even when the Rust compiler
#    is present (native extensions need a C compiler and libc headers).
RUN apt-get update && apt-get install -y --no-install-recommends \
    ca-certificates \
    gcc \
    libc6-dev \
    rustc \
    cargo \
    && rm -rf /var/lib/apt/lists/*

# 2. Runtime environment settings:
#    - unbuffered stdout so container logs appear in real time
#    - no .pyc files to keep the image lean
#    - PyO3 forward-compat flag so Rust-backed wheels build on Python 3.14
ENV PYTHONUNBUFFERED=1 \
    PYTHONDONTWRITEBYTECODE=1 \
    PYO3_USE_ABI3_FORWARD_COMPATIBILITY=1

# 3. Install dependencies first (separate layer for better build caching).
COPY pyproject.toml .
COPY uv.lock* ./

ENV UV_NO_DEV=1
# Sync against the lockfile; skip installing the project itself so this
# layer only depends on the dependency files copied above.
RUN uv sync --locked --no-install-project

COPY . .

RUN uv sync --locked

CMD ["uv", "run", "/app/src/main.py"]

debug_sudachi.py

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
#!/usr/bin/env python3
"""Debug script to examine Sudachi tokenization and POS tags."""

from sudachipy import dictionary, tokenizer

tokenizer_obj = dictionary.Dictionary().create()
# SplitMode.C is Sudachi's coarsest segmentation; it keeps compounds together.
MODE = tokenizer.Tokenizer.SplitMode.C

# Samples probing compound food names and suffix-like single morphemes.
test_texts = [
    "ミラノ風ドリア",
    "ナポリタン",
    "オムレツ",
    "塩辛い",
    "風",
    "式",
    "的",
]

RULE = "=" * 80

print(RULE)
print("Sudachi Tokenization Analysis")
print(RULE)

for text in test_texts:
    print(f"\nText: 『{text}』")
    print("-" * 60)

    for i, token in enumerate(tokenizer_obj.tokenize(text, MODE)):
        pos = token.part_of_speech()
        print(f" Token[{i}]: '{token.surface()}'")
        print(f" POS[0] (major): {pos[0]}")
        print(f" POS[1] (minor): {pos[1]}")
        # Sudachi POS tuples have up to 6 levels; show the third when present.
        if len(pos) > 2:
            print(f" POS[2]: {pos[2]}")
    print()

print(RULE)

pyproject.toml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,15 +6,17 @@ readme = "README.md"
66
requires-python = ">=3.11"
77
dependencies = [
88
"discord-py>=2.7.1",
9-
"janome>=0.5.0",
109
"matplotlib>=3.10.8",
1110
"networkx>=3.6.1",
1211
"pymongo>=4.16.0",
1312
"scipy>=1.17.1",
13+
"sudachidict-core>=20260116",
14+
"sudachipy>=0.6.10",
1415
"wordcloud>=1.9.6",
1516
]
1617

1718
[dependency-groups]
1819
dev = [
1920
"python-dotenv>=1.2.2",
21+
"pytest>=9.0.2",
2022
]

src/cogs/network.py

Lines changed: 30 additions & 118 deletions
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,15 @@
22
from discord import app_commands
33
from discord.ext import commands
44
from typing import Optional
5-
from datetime import timedelta
6-
from collections import defaultdict
75

8-
from libs.visualize import generate_conversation_network
96
from libs.embed import EmbedHelper
7+
from libs.network_service import (
8+
build_node_labels,
9+
build_conversation_edges,
10+
fetch_network_documents,
11+
generate_conversation_network,
12+
)
13+
from libs.wordcloud_service import parse_period_days
1014

1115

1216
class ConversationNetwork(commands.Cog):
@@ -55,20 +59,9 @@ async def generate_network(
5559
await interaction.response.send_message(embed=embed, ephemeral=True)
5660
return
5761

58-
# period
59-
period_filter = {}
6062
if period:
6163
try:
62-
period_days = int(period)
63-
if period_days <= 0:
64-
raise ValueError
65-
period_filter = {
66-
"timestamp": {
67-
"$gte": (
68-
discord.utils.utcnow() - timedelta(days=period_days)
69-
).isoformat()
70-
}
71-
}
64+
period_days = parse_period_days(period)
7265
except ValueError:
7366
embed = embed_helper.create_error_embed(
7467
title="エラー",
@@ -84,35 +77,19 @@ async def generate_network(
8477
await interaction.response.send_message(embed=embed, ephemeral=True)
8578
print(f"Error processing network period: {e}")
8679
return
87-
88-
user_filter = {}
89-
if user:
90-
user_filter = {"user_id": str(user.id)}
91-
92-
channel_filter = {}
93-
if channel:
94-
channel_filter = {"channel_id": str(channel.id)}
80+
else:
81+
period_days = None
9582

9683
await interaction.response.defer(thinking=True)
9784

9885
try:
99-
docs = list(
100-
self.bot.db.messages.find(
101-
{
102-
"guild_id": str(interaction.guild_id),
103-
**period_filter,
104-
**user_filter,
105-
**channel_filter,
106-
},
107-
{
108-
"message_id": 1,
109-
"user_id": 1,
110-
"reply_to": 1,
111-
"mentions": 1,
112-
},
113-
)
114-
.sort("timestamp", -1)
115-
.limit(self.MAX_MESSAGE_COUNT)
86+
docs = fetch_network_documents(
87+
self.bot.db,
88+
str(interaction.guild_id),
89+
period_days=period_days,
90+
user_id=str(user.id) if user else None,
91+
channel_id=str(channel.id) if channel else None,
92+
limit=self.MAX_MESSAGE_COUNT,
11693
)
11794
except Exception as e:
11895
embed = embed_helper.create_error_embed(
@@ -131,69 +108,17 @@ async def generate_network(
131108
await interaction.followup.send(embed=embed)
132109
return
133110

134-
# message map
135-
valid_docs = []
136-
invalid_doc_count = 0
137-
138-
for doc in docs:
139-
message_id = doc.get("message_id")
140-
author_id = doc.get("user_id")
141-
142-
if message_id is None or author_id is None:
143-
invalid_doc_count += 1
144-
continue
111+
edges, invalid_doc_count = build_conversation_edges(docs)
112+
valid_doc_count = len(docs) - invalid_doc_count
145113

146-
valid_docs.append(
147-
{
148-
"message_id": str(message_id),
149-
"user_id": str(author_id),
150-
"reply_to": (
151-
str(doc["reply_to"])
152-
if doc.get("reply_to") is not None
153-
else None
154-
),
155-
"mentions": [
156-
str(mentioned)
157-
for mentioned in doc.get("mentions", [])
158-
if mentioned is not None
159-
],
160-
}
161-
)
162-
163-
if not valid_docs:
114+
if valid_doc_count <= 0:
164115
embed = embed_helper.create_warning_embed(
165116
title="データ不足",
166117
description="解析に使えるメッセージがありませんでした。",
167118
)
168119
await interaction.followup.send(embed=embed)
169120
return
170121

171-
msg_map = {doc["message_id"]: doc for doc in valid_docs}
172-
173-
edges = defaultdict(int)
174-
175-
for msg in valid_docs:
176-
177-
author = msg.get("user_id")
178-
if author is None:
179-
continue
180-
181-
# reply
182-
reply_to = msg.get("reply_to")
183-
if reply_to and reply_to in msg_map:
184-
other = msg_map[reply_to].get("user_id")
185-
if author != other:
186-
edges[tuple(sorted([author, other]))] += 1
187-
188-
# mention
189-
mentions = msg.get("mentions", [])
190-
if not isinstance(mentions, list):
191-
continue
192-
193-
for mentioned in mentions:
194-
if mentioned != author:
195-
edges[tuple(sorted([author, mentioned]))] += 1
196-
197122
if not edges:
198123
embed = embed_helper.create_warning_embed(
199124
title="会話不足",
@@ -202,29 +127,16 @@ async def generate_network(
202127
await interaction.followup.send(embed=embed)
203128
return
204129

205-
# user id → name
206-
user_map = {}
207-
208-
for a, b in edges.keys():
209-
if a not in user_map:
210-
try:
211-
member = interaction.guild.get_member(int(a))
212-
user_map[a] = member.display_name if member else a
213-
except (TypeError, ValueError):
214-
user_map[a] = a
215-
216-
if b not in user_map:
217-
try:
218-
member = interaction.guild.get_member(int(b))
219-
user_map[b] = member.display_name if member else b
220-
except (TypeError, ValueError):
221-
user_map[b] = b
130+
def resolve_name(user_id: str) -> str:
131+
try:
132+
member = interaction.guild.get_member(int(user_id))
133+
return member.display_name if member else user_id
134+
except (TypeError, ValueError):
135+
return user_id
222136

223-
named_edges = {
224-
(user_map[a], user_map[b]): count for (a, b), count in edges.items()
225-
}
137+
node_labels = build_node_labels(edges, resolve_name)
226138

227-
if not named_edges:
139+
if not node_labels:
228140
embed = embed_helper.create_warning_embed(
229141
title="会話不足",
230142
description="ネットワーク図に変換できるデータがありませんでした。",
@@ -233,7 +145,7 @@ async def generate_network(
233145
return
234146

235147
try:
236-
image_buffer = generate_conversation_network(named_edges)
148+
image_buffer = generate_conversation_network(edges, labels=node_labels)
237149
except ValueError:
238150
embed = embed_helper.create_warning_embed(
239151
title="会話不足",
@@ -260,7 +172,7 @@ async def generate_network(
260172
embed = embed_helper.create_success_embed(
261173
title="会話ネットワーク生成",
262174
description=(
263-
f"{len(valid_docs)}件のメッセージを解析しました"
175+
f"{valid_doc_count}件のメッセージを解析しました"
264176
+ (
265177
f"\n不正データ {invalid_doc_count}件 は自動でスキップしました"
266178
if invalid_doc_count

src/cogs/optout.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
from discord.ext import commands
44
from discord import app_commands
55
from libs.embed import EmbedHelper
6+
from libs.message_store import delete_messages_by_query
67

78

89
class Optout(commands.Cog):
@@ -16,10 +17,13 @@ def __init__(self, bot: commands.Bot):
1617

1718
async def _delete_messages_background(self, query: dict, scope: str):
1819
try:
19-
# pymongo is synchronous, so run in a thread to avoid blocking the event loop.
20-
result = await asyncio.to_thread(self.bot.db.messages.delete_many, query)
20+
deleted_count = await asyncio.to_thread(
21+
delete_messages_by_query,
22+
self.bot.db,
23+
query,
24+
)
2125
print(
22-
f"[Optout] Background recursive delete completed for {scope}, deleted={result.deleted_count}"
26+
f"[Optout] Background recursive delete completed for {scope}, deleted={deleted_count}"
2327
)
2428
except Exception as e:
2529
print(f"[Optout] Background recursive delete failed for {scope}: {e}")

0 commit comments

Comments
 (0)