Skip to content

Commit e547108

Browse files
authored
Merge pull request #182 from BCSDLab/develop
Master <- Develop
2 parents 9382a2f + 4839570 commit e547108

3 files changed

Lines changed: 109 additions & 1 deletion

File tree

crawling/koreatech_article/article.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,9 +8,10 @@
88
from bs4 import BeautifulSoup, Comment
99
import urllib3
1010
import pymysql
11+
import uuid
1112

1213
from delete_article import delete_article
13-
from table import replace_table
14+
from table import replace_table, upload_txt
1415
from login_v2 import login
1516
from login import get_jwt_token
1617
from slack_notice import filter_nas, notice_to_slack
@@ -285,6 +286,11 @@ def crawling_article(board: Board, host: str, url: str) -> Article:
285286
# 표 처리
286287
# content = replace_table(content, board, num)
287288

289+
# content s3 업로드 및 url 삽입
290+
random_uuid = str(uuid.uuid4().hex)
291+
file_name = f'articles/content/board_{board.id}/{random_uuid}.txt'
292+
content = upload_txt(file_name=file_name, text_content=content)
293+
288294
# ===== 첨부 파일 =====
289295
attachment = list(map(
290296
lambda tag: Attachment(
Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
import pymysql
2+
import urllib3
3+
import uuid
4+
5+
from config import MYSQL_CONFIG
6+
from table import upload_txt
7+
8+
9+
def connect_db():
    """Open and return a MySQL connection configured from MYSQL_CONFIG."""
    # Silence urllib3 warnings before doing any network work.
    urllib3.disable_warnings()
    return pymysql.connect(
        host=MYSQL_CONFIG['host'],
        port=MYSQL_CONFIG['port'],
        user=MYSQL_CONFIG['user'],
        password=MYSQL_CONFIG['password'],
        db=MYSQL_CONFIG['db'],
        charset='utf8',
    )
18+
19+
20+
def convert_content_to_url(connection):
    """Migrate `new_articles.content` from inline text to an S3 URL.

    Walks the table in keyset-paginated batches of 500 (board 14 excluded),
    uploads each article's content to S3 via ``upload_txt``, then overwrites
    the `content` column with the returned URL.  Commits once per batch; on
    any error the current (uncommitted) batch is rolled back, the error is
    printed, and processing stops.

    NOTE(review): this is destructive — after the UPDATE the original text
    exists only in S3.  ``upload_txt`` returns a URL even when the upload
    fails (it prints and swallows exceptions), so a failed upload here would
    silently lose the article body — confirm before running in production.

    :param connection: an open pymysql connection (caller closes it).
    """
    cur = connection.cursor()
    # Fix: the original opened and closed a fresh cursor for every row
    # inside the loop; one reusable update cursor does the same work.
    update_cur = connection.cursor()
    batch_size = 500
    last_id = 0          # keyset-pagination cursor: largest id already handled
    total_articles = 0

    try:
        while True:
            cur.execute("""
                SELECT `id`, `board_id`, `content`
                FROM `new_articles`
                WHERE `content` IS NOT NULL
                AND `board_id` != 14
                AND `id` > %s
                ORDER BY `id` ASC
                LIMIT %s
            """, (last_id, batch_size))

            articles = cur.fetchall()
            if not articles:
                break

            for article_id, board_id, content in articles:
                # Random object key avoids collisions and enumeration.
                random_uuid = str(uuid.uuid4().hex)
                file_name = f'articles/content/board_{board_id}/{random_uuid}.txt'
                content_url = upload_txt(file_name=file_name, text_content=content)

                update_cur.execute("""
                    UPDATE `new_articles`
                    SET `content` = %s
                    WHERE `id` = %s
                """, (content_url, article_id))
                print(f"article {article_id} url: {content_url}")

                total_articles += 1

            last_id = articles[-1][0]
            # One commit per batch keeps transactions small.
            connection.commit()

    except Exception as error:
        # Best-effort migration script: undo the partial batch, report, stop.
        connection.rollback()
        print(error)
    finally:
        update_cur.close()
        cur.close()
        print(f"total articles: {total_articles}")
68+
69+
70+
if __name__ == "__main__":
    connection = None
    try:
        connection = connect_db()
        convert_content_to_url(connection)
    except Exception as error:
        print(error)
    finally:
        # Bug fix: if connect_db() raises, `connection` is still None and the
        # original unconditional connection.close() raised AttributeError
        # from the finally block, masking the real error.
        if connection is not None:
            connection.close()

crawling/koreatech_article/table.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -105,3 +105,27 @@ def upload_image(s3, file_name: str, image: bytes) -> str:
105105
print(e)
106106

107107
return f'{S3_CONFIG["upload_domain"]}/{file_name}'
108+
109+
110+
def upload_txt(file_name: str, text_content: str) -> str:
    """Upload *text_content* to S3 under key *file_name*; return its public URL.

    The object is stored as UTF-8 ``text/plain`` with a public-read ACL.
    Upload errors are printed and swallowed — the same best-effort convention
    as ``upload_image`` above — so the returned URL is built unconditionally
    and is not guaranteed to resolve.
    """
    client = boto3.client(
        service_name='s3',
        aws_access_key_id=S3_CONFIG['aws_access_key_id'],
        aws_secret_access_key=S3_CONFIG['aws_secret_access_key'],
    )
    body = BytesIO(text_content.encode('utf-8'))

    try:
        client.upload_fileobj(
            Fileobj=body,
            Bucket=S3_CONFIG['bucket'],
            Key=file_name,
            ExtraArgs={
                'ContentType': 'text/plain; charset=utf-8',
                'ACL': 'public-read',
            },
        )
    except Exception as e:
        print(e)

    return f'{S3_CONFIG["upload_domain"]}/{file_name}'

0 commit comments

Comments
 (0)