Initially, I removed local asset files such as JS and CSS. Then I converted all the HTML files to Markdown. I have made a fork where I am making the necessary changes, so please do not use this code.
#!/usr/bin/env python3
import os
import html2markdown
from bs4 import BeautifulSoup, Doctype
# Reset: shell commands (kept here for reference) that delete local asset files
# find . -name \*.css -type f -delete
# find . -name \*.icon -type f -delete
# find . -name \*.ico -type f -delete
# find . -name \*.js -type f -delete
# find . -name \*.png -type f -delete
# find . -name \*.svg -type f -delete
# find . -name \*.jpeg -type f -delete
# find . -name \*.jpg -type f -delete
# find . -name \*.jfif -type f -delete
# find . -name \*.json -type f -delete
# find . -name \*.gif -type f -delete
# Root of the directory tree that is scanned for HTML files.
directory = './data_sets/'

# Walk the whole tree and report the doctype of every .html file found.
for root, dirnames, filenames in os.walk(directory):
    for filename in filenames:
        if not filename.endswith('.html'):
            continue
        fname = os.path.join(root, filename)
        print('Filename: {}'.format(fname))
        # Read raw bytes so BeautifulSoup detects the document's encoding
        # itself rather than relying on the platform's default text encoding,
        # which can raise UnicodeDecodeError on files saved in another one.
        with open(fname, 'rb') as handle:
            soup = BeautifulSoup(handle.read(), 'html.parser')
        # Only the top-level nodes are inspected; stop at the first doctype.
        for item in soup.contents:
            if isinstance(item, Doctype):
                print('Doctype: {}'.format(item))
                break
Please see the documentation instead: Documentation
Initially, I removed local asset files such as JS and CSS. Then I converted all the HTML files to Markdown. I have made a fork where I am making the necessary changes, so please do not use this code.
Please see the documentation instead: Documentation