Skip to content

Commit fd48d2b

Browse files
authored
Merge pull request #176 from BCSDLab/feature/convert_article_content_to_s3_url
feat: article content s3 업로드 및 url로 변경
2 parents 48c6808 + a447dc6 commit fd48d2b

2 files changed

Lines changed: 102 additions & 0 deletions

File tree

Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
import pymysql
2+
import urllib3
3+
import uuid
4+
5+
from crawling.config import MYSQL_CONFIG
6+
from table import upload_txt
7+
8+
9+
def connect_db():
    """Open a PyMySQL connection using the settings in ``MYSQL_CONFIG``.

    urllib3 warnings are disabled before connecting. The connection uses the
    ``utf8`` charset and PyMySQL's default cursor class.

    Returns:
        The connection object returned by ``pymysql.connect``.
    """
    urllib3.disable_warnings()

    # Look the settings up in the same order the connect() call names them.
    connect_kwargs = {
        name: MYSQL_CONFIG[name]
        for name in ('host', 'port', 'user', 'password', 'db')
    }
    return pymysql.connect(charset='utf8', **connect_kwargs)
18+
19+
20+
def convert_content_to_url(connection, batch_size=500, upload=None):
    """Replace each article's stored content with the URL of an uploaded copy.

    Walks ``new_articles`` in ascending ``id`` order, ``batch_size`` rows at a
    time (rows with NULL content or ``board_id`` 14 are skipped by the query).
    For every row the content is uploaded under
    ``articles/content/board_<board_id>/<uuid>.txt`` and the ``content``
    column is overwritten with the URL the uploader returns. Each batch is
    committed on its own.

    If anything fails, the current batch is rolled back, the error is printed
    and processing stops; batches committed earlier stay committed.

    NOTE(review): rows that were already converted still match the SELECT
    (their content is a non-NULL URL), so running this twice would upload the
    URL text itself -- confirm this is only meant to run once.

    Args:
        connection: DB-API connection whose default cursor yields rows as
            sequences (``id``, ``board_id``, ``content``).
        batch_size: Maximum number of rows fetched and committed per batch.
        upload: Callable taking ``file_name`` and ``text_content`` keyword
            arguments and returning the content URL. Defaults to
            ``upload_txt``.

    Returns:
        The number of articles whose update was committed.
    """
    if upload is None:
        upload = upload_txt

    select_sql = """
        SELECT `id`, `board_id`, `content`
        FROM `new_articles`
        WHERE `content` IS NOT NULL
        AND `board_id` != 14
        AND `id` > %s
        ORDER BY `id` ASC
        LIMIT %s
    """
    update_sql = """
        UPDATE `new_articles`
        SET `content` = %s
        WHERE `id` = %s
    """

    last_id = 0
    total_articles = 0
    cur = connection.cursor()
    try:
        while True:
            cur.execute(select_sql, (last_id, batch_size))
            articles = cur.fetchall()
            if not articles:
                break

            # fetchall() has already materialised the batch, so the same
            # cursor can safely be reused for the per-row updates.
            for article_id, board_id, content in articles:
                file_name = (
                    f'articles/content/board_{board_id}/{uuid.uuid4().hex}.txt'
                )
                content_url = upload(file_name=file_name, text_content=content)
                cur.execute(update_sql, (content_url, article_id))
                print(f"article {article_id} url: {content_url}")

            connection.commit()
            # Count only rows that were actually committed.
            total_articles += len(articles)
            # Rows are sequences, not dicts: the id is the first selected
            # column. It is the keyset-pagination bookmark for the next batch.
            last_id = articles[-1][0]
    except Exception as error:
        # Undo the partially processed batch and stop instead of retrying
        # forever on a broken cursor/connection.
        connection.rollback()
        print(error)
    finally:
        cur.close()
        print(f"total articles: {total_articles}")

    return total_articles
69+
70+
71+
if __name__ == "__main__":
    # Script entry point: open the database connection, run the content
    # migration, and always release the connection afterwards.
    connection = None
    try:
        connection = connect_db()
        convert_content_to_url(connection)
    except Exception as error:
        print(error)
    finally:
        # connect_db() may raise before a connection exists; calling close()
        # on None would then fail with an AttributeError.
        if connection is not None:
            connection.close()

crawling/koreatech_article/table.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -105,3 +105,26 @@ def upload_image(s3, file_name: str, image: bytes) -> str:
105105
print(e)
106106

107107
return f'{S3_CONFIG["upload_domain"]}/{file_name}'
108+
109+
def upload_txt(file_name: str, text_content: str, s3=None) -> str:
    """Upload text to S3 as a public UTF-8 ``text/plain`` object.

    Args:
        file_name: Object key to store the text under in
            ``S3_CONFIG['bucket']``.
        text_content: Text to upload; it is encoded as UTF-8.
        s3: Optional boto3 S3 client to reuse. When omitted, a client is
            created from the credentials in ``S3_CONFIG`` for this call.

    Returns:
        ``S3_CONFIG["upload_domain"]`` joined with ``file_name`` by a slash.

    Raises:
        Exception: Any error raised by the upload is printed and re-raised.
            Returning the URL after a failed upload would let callers store
            a link to an object that does not exist.
    """
    if s3 is None:
        s3 = boto3.client(
            service_name='s3',
            aws_access_key_id=S3_CONFIG['aws_access_key_id'],
            aws_secret_access_key=S3_CONFIG['aws_secret_access_key'],
        )
    encoded_text = text_content.encode('utf-8')

    try:
        s3.upload_fileobj(
            Fileobj=BytesIO(encoded_text),
            Bucket=S3_CONFIG['bucket'],
            Key=file_name,
            ExtraArgs={
                'ContentType': 'text/plain; charset=utf-8',
                'ACL': 'public-read',
            },
        )
    except Exception as e:
        print(e)
        raise

    return f'{S3_CONFIG["upload_domain"]}/{file_name}'

0 commit comments

Comments
 (0)