SemanticLayerTools/src/semanticlayertools/clustering/leiden.py at 644dd31507b5692774060b9186cdd7187b32f46c · DHCodeReview/SemanticLayerTools · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
import os
import time
import re
from tqdm import tqdm

import igraph as ig
import leidenalg as la


class TimeCluster():
    """Cluster time-sliced data with the Leiden algorithm.

    Calculates temporal clusters of e.g. time-sliced cocitation or citation
    data, using the Leiden algorithm . Two nodes are assumed to be identical in
    different year slices, if the node name is the same.
    This could be e.g. the bibcode or DOI.

    Input files are assumed to include the year in the filename, have an ending
    `_GC.net` to denote their giant component character and should be in Pajek
    format.

    The resolution parameter can be seen as a limiting density, above
    which neighbouring nodes are considered a cluster. The interslice coupling
    describes the influcence of yearly order on the clustering process. See doc
    for the Leiden algorithm for more detailed info.

    :param inpath: Path for input network data
    :type inpath: str
    :param outpath: Path for writing output data
    :type outpath: str
    :param resolution: Main parameter for the clustering quality function (Constant Pots Model)
    :type resolution: float
    :param intersliceCoupling: Coupling parameter between two year slices, also influences cluster detection
    :type intersliceCoupling: float
    :param timerange: The time range for considering input data (default=1945,2005))
    :type timerange: tuple
    :raises OSError: If the output file already exists at class instantiation

    .. seealso::
       Traag, V.A., Waltman. L., Van Eck, N.-J. (2018).
       From Louvain to Leiden: guaranteeing well-connected communities.
       Scientific reports, 9(1), 5233. 10.1038/s41598-019-41695-z
    """

    def __init__(
        self, inpath: str, outpath: str,
        resolution: float = 0.003,
        intersliceCoupling: float = 0.4,
        timerange: tuple = (1945, 2005),
        useGC: bool = True,
    ):
        starttime = time.time()
        self.inpath = inpath
        self.outpath = outpath
        self.res_param = resolution
        self.interslice_param = intersliceCoupling
        self.timerange = timerange

        self.outfile = os.path.join(
            outpath,
            f'timeclusters_{timerange[0]}-{timerange[1]}_res_{resolution}_intersl_{intersliceCoupling}.csv'
        )
        if os.path.isfile(self.outfile):
            raise OSError(f'Output file at {self.outfile} exists. Aborting.')

        if useGC is True:
            edgefiles = [x for x in os.listdir(inpath) if x.endswith('_GC.net')]
        elif useGC is False:
            edgefiles = [x for x in os.listdir(inpath) if x.endswith('.ncol')]

        self.graphDict = {}

        for idx in tqdm(range(len(edgefiles)), leave=False):
            try:
                year = re.findall(r'\d{4}', edgefiles[idx])[0]
            except Exception:
                raise
            if timerange[0] <= int(year) <= timerange[1]:
                if useGC is True:
                    graph = ig.Graph.Read_Pajek(os.path.join(inpath, edgefiles[idx]))
                elif useGC is False:
                    graph = ig.Graph.Read_Ncol(os.path.join(inpath, edgefiles[idx]))
                self.graphDict[year] = graph

        self.optimiser = la.Optimiser()

        print(
            "Graphs between "
            f"{min(list(self.graphDict.keys()))} and "
            f"{max(list(self.graphDict.keys()))} "
            f"loaded in {time.time() - starttime} seconds."
        )

    def optimize(self, clusterSizeCompare: int = 1000):
        """Optimize clusters accross time slices.

        This runs the actual clustering and can be very time and memory
        consuming for large networks. Depending on the obtained cluster results,
        this method has to be run iteratively with varying resolution parameter.
        Output is written to file, with filename containing chosen parameters.

        The output CSV contains information on which node in which year belongs
        to which cluster. As a first measure of returned clustering, the method
        prints the number of clusters found above a threshold defined by
        `clusterSizeCompare`. This does not influence the output clustering.

        :param clusterSizeCompare: Threshold for `interesting` clusters
        :type clusterSizeCompare: int
        :returns: Tuple of output file path and list of found clusters in tuple format (node, year, cluster)
        :rtype: tuple

        .. seealso::
           Documentation of time-layer creation routine:
           `Leiden documentation <https://leidenalg.readthedocs.io/en/latest/multiplex.html#temporal-community-detection>`_
        """
        starttime = time.time()

        layers, interslice_layer, _ = la.time_slices_to_layers(
            list(self.graphDict.values()),
            interslice_weight=self.interslice_param,
            vertex_id_attr='name'
        )
        print('\tSet layers.')

        partitions = [
            la.CPMVertexPartition(
                H,
                node_sizes='node_size',
                weights='weight',
                resolution_parameter=self.res_param
            ) for H in layers
        ]
        print('\tSet partitions.')

        interslice_partition = la.CPMVertexPartition(
            interslice_layer,
            resolution_parameter=0,
            node_sizes='node_size',
            weights='weight'
        )
        print('\tSet interslice partions.')

        self.optimiser.optimise_partition_multiplex(
            partitions + [interslice_partition]
        )

        subgraphs = interslice_partition.subgraphs()

        commun = []
        for idx, part in enumerate(subgraphs):
            nodevals = [
                (
                    x['name'],
                    list(self.graphDict.keys()).pop(x['slice']),
                    idx
                ) for x in part.vs
            ]
            commun.extend(nodevals)

        with open(self.outfile, 'w') as outfile:
            outfile.write('node,year,cluster\n')
            for elem in commun:
                outfile.write(
                    f"{elem[0]},{elem[1]},{elem[2]}\n"
                )
        largeclu = [
            (x, len(x.vs)) for x in subgraphs if len(x.vs) > clusterSizeCompare
        ]
        print(
            f'Finished in {time.time() - starttime} seconds.'
            f"Found {len(subgraphs)} clusters, with {len(largeclu)} larger then {clusterSizeCompare} nodes."
        )

        return self.outfile, commun