一个小工具:发现有一个开源的诗词库( https://github.com/michaelliao/shici/tree/master/web/src/main/resources/text ),考虑将其导入到数据库中。诗词库是以文件夹方式组织的,所以用 Python 可以很快地实现这个需求。主要功能包括:递归枚举文件夹,然后将文件内容读出来,写入数据库。
代码记录如下:
代码语言:python
# 导入依赖库
import io
import os
import sys
from datetime import datetime

import pymysql
def readpoem(path):
    """Return the body text of one poem file.

    Lines containing the substring ``form`` or ``tags`` (presumably the
    header lines of the poem files) and blank lines are skipped. Every other
    line is kept with its line terminator, and the kept lines are
    concatenated in file order.

    Args:
        path: Path of the poem text file; it is decoded as UTF-8.

    Returns:
        The concatenated poem lines, or ``''`` when no line is kept.
    """
    kept = []
    # io.open accepts ``encoding`` on both Python 2 and Python 3, and the
    # ``with`` block closes the handle even if decoding fails midway.
    with io.open(path, 'r', encoding='utf-8') as f:
        for line in f:
            if 'form' in line or 'tags' in line:
                continue
            if not line.strip():
                continue
            # Accumulate every kept line; plain assignment would keep only
            # the last one.
            kept.append(line)
    return ''.join(kept)
def readmeta(path):
    """Return the descriptive text of one ``meta`` file as a single string.

    Lines containing the substring ``birth`` or ``death`` and blank lines are
    skipped. Every other line has its newline characters stripped and the
    results are concatenated in file order with no separator.

    Args:
        path: Path of the meta text file; it is decoded as UTF-8.

    Returns:
        The concatenated meta lines, or ``''`` when no line is kept.
    """
    kept = []
    # io.open accepts ``encoding`` on both Python 2 and Python 3, and the
    # ``with`` block closes the handle even if decoding fails midway.
    with io.open(path, 'r', encoding='utf-8') as f:
        for line in f:
            if 'birth' in line or 'death' in line:
                continue
            if not line.strip():
                continue
            # Strip the newline character, not the letter "n".
            kept.append(line.strip('\n'))
    return ''.join(kept)
def _poem_title(file_path):
    """Build a poem title from the last three components of *file_path*.

    The extension is dropped first. The first of the (up to) three trailing
    components is reduced to the text after its first dot and wrapped in
    brackets, the second is appended as-is, and the third is appended after a
    dash. This presumably yields ``[dynasty]poet-poem`` for the poem library
    layout; confirm against the actual directory names.
    """
    stem = os.path.splitext(file_path)[0]
    # Normalise the platform separator so Windows paths split correctly too.
    parts = stem.replace(os.sep, '/').split('/')[-3:]
    title = ''
    for index, part in enumerate(parts):
        if index == 0:
            pieces = part.split('.')
            # Fall back to the whole name when there is no "<n>." prefix
            # instead of raising IndexError.
            name = pieces[1] if len(pieces) > 1 else part
            title = '[{0}]'.format(name.replace("'", ''))
        elif index == 1:
            title += part.strip()
        else:
            title += '-{0}'.format(part.strip())
    return title


def listdir(path, list_result):
    """Recursively collect poems and meta text below *path*.

    For every directory visited, one dict is appended to *list_result* when
    the directory itself contributed anything:

    * ``'meta'``: text of a ``.txt`` file whose name contains ``meta``
      (read with ``readmeta``), when that text is not blank;
    * ``'poem'``: a list of ``{'title': ..., 'content': ...}`` dicts, one per
      other ``.txt`` file whose content (read with ``readpoem``) is not blank.

    Sub-directories are processed before their parent's dict is appended.
    Single quotes are removed from the content so the text stays safe for
    callers that embed it in a quoted SQL literal.

    Args:
        path: Directory to scan.
        list_result: List that receives the per-directory dicts (mutated in
            place).

    Returns:
        None.
    """
    result = {}
    contain_file = False
    # Sorted so the output order does not depend on the file system.
    for entry in sorted(os.listdir(path)):
        file_path = os.path.join(path, entry)
        if os.path.isdir(file_path):
            listdir(file_path, list_result)
            continue
        if os.path.splitext(entry)[1] != '.txt':
            continue
        # Test the file name only: a parent directory that happens to contain
        # "meta" must not turn every poem into a meta file.
        if 'meta' in entry:
            meta = readmeta(file_path)
            if meta.strip():
                result['meta'] = meta
            continue
        contain_file = True
        poems = result.setdefault('poem', [])
        content = readpoem(file_path)
        if not content.strip():
            # Skip empty poems so every stored entry has title and content.
            continue
        poems.append({
            'content': content.replace("'", ''),
            'title': _poem_title(file_path),
        })
    if contain_file:
        print('poem.numb:{0}'.format(len(result['poem'])))
    if result:
        list_result.append(result)
# Python 2 shim: make UTF-8 the implicit str/unicode codec. On Python 3
# sys.getdefaultencoding() already returns 'utf-8', so the branch is skipped
# there (``reload`` is a builtin on Python 2 only).
defaultencoding = 'utf-8'
if sys.getdefaultencoding() != defaultencoding:
    reload(sys)  # noqa: F821
    sys.setdefaultencoding(defaultencoding)

# Collect everything below the directory that contains this script.
result_poems = []
listdir(sys.path[0], result_poems)
print("size:%d" % len(result_poems))

# Write the collected poems to the database.
conn = pymysql.connect(host='localhost', port=3306, user='root', passwd='python', db='myschool', charset='utf8')
cur = conn.cursor()
dt = datetime.now()
now = dt.strftime('%Y%m%d%H')
# Placeholders let the driver do the quoting, so poem text containing quotes
# or backslashes can neither break nor inject into the statement.
sql = "insert into shici(`name`,`content`,`time`) values (%s, %s, %s)"
try:
    for item in result_poems:
        print(item.get('meta', ''))
        # A directory holding only a meta file has no 'poem' key.
        for poem in item.get('poem', []):
            if 'title' not in poem or 'content' not in poem:
                continue
            print('name:{0}, content:{1}'.format(poem['title'], poem['content']))
            # int(now) keeps the timestamp an unquoted numeric literal, as in
            # the original statement.
            cur.execute(sql, (poem['title'], poem['content'], int(now)))
    # Commit once at the end so the import is all-or-nothing.
    conn.commit()
except Exception:
    # Undo the partial import and surface the error instead of hiding it.
    conn.rollback()
    raise
finally:
    conn.close()