|
| 1 | +#!/usr/bin/python |
| 2 | +# -*- coding: utf-8 -*- |
| 3 | +""" |
| 4 | +Created by PyCharm. |
| 5 | +File Name: LinuxBashShellScriptForOps:convert-chinese-to-pinyin.py |
| 6 | +Version: 0.0.1 |
| 7 | +Author: dgden |
| 8 | +Author Email: dgdenterprise@gmail.com |
| 9 | +URL: https://github.com/DingGuodong/LinuxBashShellScriptForOps |
| 10 | +Download URL: https://github.com/DingGuodong/LinuxBashShellScriptForOps/tarball/master |
| 11 | +Create Date: 2021/3/16 |
| 12 | +Create Time: 14:30 |
| 13 | +Description: Convert Chinese characters to Pinyin |
| 14 | +Long Description: |
| 15 | +References: |
| 16 | +Prerequisites: pip install pypinyin |
| 17 | + pip install jieba |
| 18 | +Development Status: 3 - Alpha, 5 - Production/Stable |
| 19 | +Environment: Console |
| 20 | +Intended Audience: System Administrators, Developers, End Users/Desktop |
| 21 | +License: Freeware, Freely Distributable |
| 22 | +Natural Language: English, Chinese (Simplified) |
| 23 | +Operating System: POSIX :: Linux, Microsoft :: Windows |
| 24 | +Programming Language: Python :: 2.6 |
| 25 | +Programming Language: Python :: 2.7 |
| 26 | +Topic: Utilities |
| 27 | + """ |
| 28 | +import jieba |
| 29 | +from pypinyin import lazy_pinyin |
| 30 | + |
| 31 | +# company_type_set = { |
| 32 | +# u"股份有限公司", |
| 33 | +# u"有限责任公司", |
| 34 | +# u"有限公司", |
| 35 | +# } |
| 36 | + |
# Legal-form fragments ("company", "limited", "liability", "shares") that
# remove_company_type() strips from a company name.
company_type_list = [
    u"公司",
    u"有限",
    u"责任",
    u"股份",
]

# Line-of-business terms ("insurance agency", "insurance brokerage",
# "insurance") that remove_biz_type() strips from a name.
# The replacements are applied in list order, so the longer terms must stay
# ahead of the bare u"保险"; otherwise u"保险代理" would be reduced to u"代理".
biz_type_list = [
    u"保险代理",
    u"保险经纪",
    u"保险",
]
| 39 | + |
| 40 | + |
def remove_company_type(company_name, company_types=None):
    """Strip legal-form fragments from a company name.

    Every occurrence of each fragment is removed, wherever it appears in the
    name, and the fragments are processed in the order given.

    :param company_name: unicode company name to clean.
    :param company_types: optional iterable of fragments to remove; when
        omitted (None) the module-level ``company_type_list`` is used.
    :return: the name with all listed fragments removed.
    """
    if company_types is None:
        # Default to the module-wide list; resolved at call time so later
        # changes to that list are honoured.
        company_types = company_type_list

    for company_type in company_types:
        company_name = company_name.replace(company_type, u"")
    return company_name
| 45 | + |
| 46 | + |
def remove_biz_type(company_name, biz_types=None):
    """Strip line-of-business terms from a company name.

    Every occurrence of each term is removed, wherever it appears. Terms are
    processed in the order given, so a longer term must be listed before any
    shorter term it contains.

    :param company_name: unicode company name to clean.
    :param biz_types: optional iterable of terms to remove; when omitted
        (None) the module-level ``biz_type_list`` is used.
    :return: the name with all listed terms removed.
    """
    if biz_types is None:
        # Default to the module-wide list; resolved at call time so later
        # changes to that list are honoured.
        biz_types = biz_type_list

    for biz_type in biz_types:
        company_name = company_name.replace(biz_type, u"")
    return company_name
| 51 | + |
| 52 | + |
def get_company_name_and_brief(company_name):
    """Derive a short name and its pinyin from a full company name.

    The name is segmented with ``jieba.cut``; the brief is built from the
    leading tokens, with business-type terms removed by ``remove_biz_type``.
    The number of leading tokens kept depends on how many tokens there are:

    * up to 3 tokens  -> first token only
    * 4 or 5 tokens   -> first two tokens
    * 6 or more       -> first three tokens

    The full name, the brief and the brief's pinyin are printed on one line
    separated by single spaces, and also returned.

    :param company_name: unicode company name.
    :return: tuple ``(company_name, company_name_brief, pinyin)``.
    """
    tokens = list(jieba.cut(company_name))
    token_count = len(tokens)

    # Explicit, non-overlapping thresholds. An empty token list yields an
    # empty brief because slicing never raises.
    if token_count <= 3:
        keep = 1
    elif token_count <= 5:
        keep = 2
    else:
        keep = 3

    company_name_brief = remove_biz_type(u"".join(tokens[:keep]))
    pinyin = u"".join(lazy_pinyin(company_name_brief))

    line = u" ".join((company_name, company_name_brief, pinyin))
    if not isinstance(line, str):
        # Python 2: ``line`` is ``unicode`` here; emit UTF-8 bytes so the
        # output does not depend on the console's default codec.
        line = line.encode("utf-8")
    # Single-argument form is valid as both the Python 2 print statement and
    # the Python 3 print function.
    print(line)

    return company_name, company_name_brief, pinyin
| 68 | + |
| 69 | + |
def main():
    """Run the conversion over a built-in list of sample company names."""
    sample_names = [
        u"山东大有保险代理股份有限公司",  # test case
        u"升宏保险代理有限公司",
        u"安信联合保险经纪有限公司",
        u"重庆恒蕴汽车保险代理有限公司",
        u"黑龙江善邦保险代理有限公司",
        u"国泰家和保险代理有限公司",  # "和"
        u"北京易才宏业保险经纪有限公司"
    ]
    for sample in sample_names:
        get_company_name_and_brief(sample)


if __name__ == '__main__':
    main()
0 commit comments