stats のソースコード

import os
import codecs

# Directory two levels above this file. Backslashes are turned into forward
# slashes so that the '/'-joined paths built from it look the same on Windows.
root_dir = '/'.join(
    os.path.dirname(os.path.dirname(os.path.abspath(__file__))).split('\\')
)
# Directory in which write_calc_tree() creates calc_tree.txt.
data_path = '%s/python/data' % root_dir

def dump_ele(tree_f, max_len_dic, ele, nest, processed):
    """Write ``ele`` and its descendants as an indented tree and count them.

    Elements that are not keys of ``max_len_dic`` are skipped together with
    their subtrees. An element that has already been visited is written
    again with a trailing ``*`` marker, but its children are not expanded a
    second time.

    Args:
        tree_f: Object with a ``write(str)`` method, or ``None`` to only
            count elements without producing any output.
        max_len_dic: Mapping whose keys are the elements eligible for output.
        ele: Element to dump; it must provide ``verbose_label``, ``id`` and
            ``child_elements``.
        nest: Current depth; every level indents the line by four spaces.
        processed: Set of elements visited so far. It is updated in place.

    Returns:
        The number of elements in ``processed`` after this call.
    """
    if ele not in max_len_dic:
        return len(processed)

    first_visit = ele not in processed
    if first_visit:
        processed.add(ele)

    if tree_f is not None:
        # Repeated elements are flagged with " *" instead of being expanded.
        marker = "" if first_visit else " *"
        tree_f.write("%s%s %s%s\n" % ("    " * nest, ele.verbose_label, ele.id, marker))

    if first_visit:
        for child in ele.child_elements:
            dump_ele(tree_f, max_len_dic, child, nest + 1, processed)

    return len(processed)


def set_max_len_parent2(top_items, max_len_dic, ele):
    """Compute the longest parent chain above ``ele`` and prune shorter links.

    An element without parents is recorded in ``top_items`` and has length 0.
    Otherwise the length is one more than the largest length among its
    parents. When there are several parents, every parent whose length is
    smaller than that maximum is unlinked from ``ele`` in both directions
    (``ele.parents`` and ``parent.child_elements``); parents that tie for the
    maximum are all kept.

    Args:
        top_items: Set that collects the elements having no parents.
        max_len_dic: Cache of already computed lengths; it is only passed on
            to ``set_max_len_parent`` here.
        ele: Element to examine; it must provide ``parents``.

    Returns:
        The length of the longest chain of parents above ``ele``.
    """
    parent_count = len(ele.parents)

    if parent_count == 0:
        top_items.add(ele)
        return 0

    if parent_count == 1:
        return 1 + set_max_len_parent(top_items, max_len_dic, ele.parents[0])

    depths = []
    for candidate in ele.parents:
        depths.append(set_max_len_parent(top_items, max_len_dic, candidate))
    deepest = max(depths)

    # Walk over a snapshot because ele.parents is modified inside the loop.
    for candidate, depth in zip(tuple(ele.parents), depths):
        if depth >= deepest:
            continue

        assert candidate in ele.parents
        ele.parents.remove(candidate)

        assert ele in candidate.child_elements
        candidate.child_elements.remove(ele)

    return 1 + deepest

def set_max_len_parent(top_items, max_len_dic, ele):
    """Return the parent-chain length of ``ele``, computing it at most once.

    The value is taken from ``max_len_dic`` when ``ele`` is already a key;
    otherwise it is obtained from ``set_max_len_parent2`` and stored in
    ``max_len_dic`` before being returned.

    Args:
        top_items: Set passed on to ``set_max_len_parent2``.
        max_len_dic: Mapping from element to its computed length; updated in
            place.
        ele: Element whose length is requested.

    Returns:
        The length stored in ``max_len_dic`` for ``ele``.
    """
    if ele not in max_len_dic:
        max_len_dic[ele] = set_max_len_parent2(top_items, max_len_dic, ele)

    return max_len_dic[ele]



def write_calc_tree(context_names, ns_xsd_dic, annual_account_stats, quarterly_account_stats, rank=200):
    """Write the calculation trees of the most frequent accounts to a file.

    The function works in three steps:

    1. For both statistics arguments, every accounting standard and every
       context whose name starts with ``CurrentYear``, the ``rank`` entries
       with the largest values are collected, separately for names
       containing ``Instant`` and names containing ``Duration``.
    2. The elements of ``ns_xsd_dic`` are linked: for each element with a
       non-empty ``calcTo``, ``calcTo`` is sorted by ``order``,
       ``child_elements`` is rebuilt from it, and the element is appended to
       the ``parents`` of each child.
    3. For each of the two id groups the elements are resolved, their parent
       links are reduced by ``set_max_len_parent`` and the resulting trees
       are written to ``<data_path>/calc_tree.txt`` (UTF-8), largest tree
       first.

    Args:
        context_names: Sequence of context names. The position of a name is
            used as the index into each per-standard statistics sequence.
        ns_xsd_dic: Mapping of namespace prefix to a mapping of key to
            element. Only entries whose key equals ``ele.id`` are used.
        annual_account_stats: Mapping of accounting standard to a sequence
            (indexed like ``context_names``) of ``{account id: value}``
            mappings. Account ids are expected to look like ``"ns:tag"``.
        quarterly_account_stats: Same structure as ``annual_account_stats``.
        rank: How many of the highest-valued account ids to take per context.

    Raises:
        AssertionError: If a ``CurrentYear`` context name contains neither
            ``Instant`` nor ``Duration``, if an id cannot be resolved to an
            element, or if an element is already among a child's parents.

    Note:
        The elements reachable from ``ns_xsd_dic`` are modified in place
        (``calcTo``, ``child_elements`` and ``parents``).
    """
    instant_account_ids = set()
    duration_account_ids = set()

    # For each kind of report (annual securities report, quarterly report) ...
    for account_stats in (annual_account_stats, quarterly_account_stats):

        # ... for each accounting standard ...
        for stats in account_stats.values():

            # ... and for each kind of context.
            for idx, context_name in enumerate(context_names):
                if not context_name.startswith('CurrentYear'):
                    continue

                if 'Instant' in context_name:
                    # Point-in-time context.
                    target_ids = instant_account_ids
                else:
                    # Period context.
                    assert 'Duration' in context_name
                    target_ids = duration_account_ids

                ranked = sorted(stats[idx].items(), key=lambda x: x[1], reverse=True)

                # Only the most frequent items are used.
                target_ids.update(account_id for account_id, _ in ranked[:rank])

    # Keep only the entries registered under their own id and link every
    # element with its calculation children.
    ns_xsd_dic2 = {}
    for ns, dic in ns_xsd_dic.items():
        dic2 = {}
        ns_xsd_dic2[ns] = dic2

        for key, ele in dic.items():
            if key != ele.id:
                continue

            dic2[key] = ele

            if len(ele.calcTo) != 0:
                ele.calcTo = sorted(ele.calcTo, key=lambda x: x.order)
                ele.child_elements = [x.to for x in ele.calcTo]

                for child in ele.child_elements:
                    assert ele not in child.parents
                    child.parents.append(ele)

    # The context manager closes the file even when an assertion or lookup
    # below fails part-way through.
    with codecs.open("%s/calc_tree.txt" % data_path, 'w', 'utf-8') as tree_f:

        # Section titles: "会計終了時点" = end of the accounting period,
        # "会計期間" = accounting period.
        for ids, context_name in zip([instant_account_ids, duration_account_ids], ["会計終了時点", "会計期間"]):
            # NOTE(review): "コンテスト" (contest) looks like a typo for
            # "コンテキスト" (context). It is kept unchanged because it is part
            # of the generated file's content.
            tree_f.write("\n%s\nコンテスト : %s\n%s\n" % ('-'*80, context_name, '-'*80))

            all_items = []
            seen_items = set()

            for account_id in ids:

                ns, tag_name = account_id.split(':')

                # Get the schema dictionary for the namespace.
                assert ns in ns_xsd_dic2
                xsd_dic = ns_xsd_dic2[ns]

                # Get the element for the tag name, falling back to the
                # "<ns>_<tag>" form of the key.
                if tag_name not in xsd_dic:
                    tag_name = ns + "_" + tag_name
                assert tag_name in xsd_dic
                ele = xsd_dic[tag_name]

                if ele.type in ['stringItemType', 'textBlockItemType', 'dateItemType']:
                    continue

                # Keep first-seen order while dropping duplicates.
                if ele not in seen_items:
                    seen_items.add(ele)
                    all_items.append(ele)

            top_items = set()
            max_len_dic = {}
            for ele in all_items:
                set_max_len_parent(top_items, max_len_dic, ele)

            # Dry run (no output stream) to learn the size of each tree, so
            # that the largest trees are written first.
            top_cnts = [(ele, dump_ele(None, max_len_dic, ele, 0, set())) for ele in top_items]
            top_cnts.sort(key=lambda x: x[1], reverse=True)

            for ele, _ in top_cnts:
                dump_ele(tree_f, max_len_dic, ele, 0, set())