|
1 | 1 | package io.github.david0x03.metrics; |
2 | 2 |
|
3 | | -import io.github.david0x03.Database; |
4 | | -import io.github.david0x03.FeatureUtils; |
5 | | -import org.apache.commons.math3.stat.descriptive.DescriptiveStatistics; |
6 | | -import org.apache.log4j.LogManager; |
7 | | -import org.apache.log4j.Logger; |
8 | | - |
9 | 3 | import java.util.HashMap; |
10 | 4 | import java.util.List; |
11 | 5 |
|
| 6 | +import org.apache.commons.math3.stat.descriptive.DescriptiveStatistics; |
| 7 | +import org.apache.logging.log4j.LogManager; |
| 8 | +import org.apache.logging.log4j.Logger; |
| 9 | + |
| 10 | +import io.github.david0x03.Database; |
| 11 | +import io.github.david0x03.FeatureUtils; |
| 12 | + |
12 | 13 | /** |
13 | 14 | * Calculates basic metrics and feature distributions for mined repositories. |
14 | | - * Includes statistics for successfully mined repositories, rejected repositories, and |
| 15 | + * Includes statistics for successfully mined repositories, rejected |
| 16 | + * repositories, and |
15 | 17 | * feature distributions across repositories. |
16 | 18 | */ |
17 | 19 | public class BasicMetrics { |
18 | 20 |
|
19 | | - private final Logger logger = LogManager.getLogger(BasicMetrics.class); |
20 | | - |
21 | | - private final List<String> mainFeatures; |
22 | | - |
23 | | - /** |
24 | | - * Initializes the BasicMetrics object and logs statistics for mined repositories. |
25 | | - * It categorizes repositories into successfully mined, Android projects, and failed cases. |
26 | | - */ |
27 | | - public BasicMetrics() { |
28 | | - var minedRepos = Database.getAllMinedRepos(); |
29 | | - var distinctFeatures = Database.getDistinctFeatures(); |
30 | | - |
31 | | - // Extract main categories of features |
32 | | - mainFeatures = distinctFeatures.stream().map(FeatureUtils::getSecurityFeatureMainCategory).distinct().toList(); |
33 | | - |
34 | | - int androidProjects = 0; |
35 | | - int successfullyMined = 0; |
36 | | - |
37 | | - for (var minedRepo : minedRepos) { |
38 | | - if (minedRepo.note() == null) |
39 | | - successfullyMined++; |
40 | | - else if (minedRepo.note().equals("android project")) |
41 | | - androidProjects++; |
42 | | - } |
43 | | - |
44 | | - int failedToMine = minedRepos.size() - successfullyMined - androidProjects; |
45 | | - |
46 | | - logger.info("Total mined repositories: " + minedRepos.size()); |
47 | | - logger.info("Rejected: " + (minedRepos.size() - successfullyMined) + " (android: " + androidProjects + " | failed: " + failedToMine + ")"); |
48 | | - logger.info("Successfully mined: " + successfullyMined); |
49 | | - } |
50 | | - |
51 | | - /** |
52 | | - * Calculates the distribution of security features across repositories. |
53 | | - * Excludes outliers based on the 5th and 95th percentiles. |
54 | | - * |
55 | | - * @return A map where keys are feature names and values are their average distributions as percentages. |
56 | | - */ |
57 | | - public HashMap<String, Double> calcDistribution() { |
58 | | - var featureDistribution = new HashMap<String, DescriptiveStatistics>(); |
59 | | - mainFeatures.forEach(f -> featureDistribution.put(f, new DescriptiveStatistics())); |
60 | | - |
61 | | - var featureCounts = Database.getFeatureCountsByRepoAndFile(); |
62 | | - var fileCounts = Database.getFileCounts(); |
63 | | - |
64 | | - // Calculate feature occurrences across repositories |
65 | | - featureCounts.forEach((minedRepoId, counts) -> { |
66 | | - for (var feature : mainFeatures) { |
67 | | - var occurrences = counts.stream().filter(fCount -> { |
68 | | - var f = FeatureUtils.getSecurityFeatureMainCategory(fCount.feature()); |
69 | | - return f.equals(feature); |
70 | | - }).toList(); |
71 | | - |
72 | | - if (occurrences.isEmpty()) continue; |
73 | | - var zeroRatio = (double) occurrences.size() / (double) fileCounts.get(minedRepoId); |
74 | | - |
75 | | - featureDistribution.get(feature).addValue(zeroRatio); |
76 | | - } |
77 | | - }); |
78 | | - |
79 | | - // Compute average distributions excluding outliers |
80 | | - var averages = new HashMap<String, Double>(); |
81 | | - featureDistribution.forEach((feature, stats) -> { |
82 | | - |
83 | | - double lowerPercentile = stats.getPercentile(5); |
84 | | - double upperPercentile = stats.getPercentile(95); |
85 | | - |
86 | | - double sum = 0; |
87 | | - int count = 0; |
88 | | - for (double value : stats.getValues()) { |
89 | | - if (value >= lowerPercentile && value <= upperPercentile) { |
90 | | - sum += value; |
91 | | - count++; |
92 | | - } |
93 | | - } |
94 | | - |
95 | | - double meanWithoutOutliers = sum / count * 100; |
96 | | - averages.put(feature, meanWithoutOutliers); |
97 | | - }); |
98 | | - |
99 | | - return averages; |
100 | | - } |
| 21 | + private final Logger logger = LogManager.getLogger(BasicMetrics.class); |
| 22 | + |
| 23 | + private final List<String> mainFeatures; |
| 24 | + |
| 25 | + /** |
| 26 | + * Initializes the BasicMetrics object and logs statistics for mined |
| 27 | + * repositories. |
| 28 | + * It categorizes repositories into successfully mined, Android projects, and |
| 29 | + * failed cases. |
| 30 | + */ |
| 31 | + public BasicMetrics() { |
| 32 | + final var minedRepos = Database.getAllMinedRepos(); |
| 33 | + final var distinctFeatures = Database.getDistinctFeatures(); |
| 34 | + |
| 35 | + // Extract main categories of features |
| 36 | + this.mainFeatures = distinctFeatures.stream().map(FeatureUtils::getSecurityFeatureMainCategory).distinct() |
| 37 | + .toList(); |
| 38 | + |
| 39 | + var androidProjects = 0; |
| 40 | + var successfullyMined = 0; |
| 41 | + |
| 42 | + for (final var minedRepo : minedRepos) { |
| 43 | + if (minedRepo.note() == null) { |
| 44 | + successfullyMined++; |
| 45 | + } else if (minedRepo.note().equals("android project")) { |
| 46 | + androidProjects++; |
| 47 | + } |
| 48 | + } |
| 49 | + |
| 50 | + final var failedToMine = minedRepos.size() - successfullyMined - androidProjects; |
| 51 | + |
| 52 | + this.logger.info("Total mined repositories: " + minedRepos.size()); |
| 53 | + this.logger.info("Rejected: " + (minedRepos.size() - successfullyMined) + " (android: " + androidProjects |
| 54 | + + " | failed: " + failedToMine + ")"); |
| 55 | + this.logger.info("Successfully mined: " + successfullyMined); |
| 56 | + } |
| 57 | + |
| 58 | + /** |
| 59 | + * Calculates the distribution of security features across repositories. |
| 60 | + * Excludes outliers based on the 5th and 95th percentiles. |
| 61 | + * |
| 62 | + * @return A map where keys are feature names and values are their average |
| 63 | + * distributions as percentages. |
| 64 | + */ |
| 65 | + public HashMap<String, Double> calcDistribution() { |
| 66 | + final var featureDistribution = new HashMap<String, DescriptiveStatistics>(); |
| 67 | + this.mainFeatures.forEach(f -> featureDistribution.put(f, new DescriptiveStatistics())); |
| 68 | + |
| 69 | + final var featureCounts = Database.getFeatureCountsByRepoAndFile(); |
| 70 | + final var fileCounts = Database.getFileCounts(); |
| 71 | + |
| 72 | + // Calculate feature occurrences across repositories |
| 73 | + featureCounts.forEach((minedRepoId, counts) -> { |
| 74 | + for (final var feature : this.mainFeatures) { |
| 75 | + final var occurrences = counts.stream().filter(fCount -> { |
| 76 | + final var f = FeatureUtils.getSecurityFeatureMainCategory(fCount.feature()); |
| 77 | + return f.equals(feature); |
| 78 | + }).toList(); |
| 79 | + |
| 80 | + if (occurrences.isEmpty()) { |
| 81 | + continue; |
| 82 | + } |
| 83 | + final var zeroRatio = (double) occurrences.size() / (double) fileCounts.get(minedRepoId); |
| 84 | + |
| 85 | + featureDistribution.get(feature).addValue(zeroRatio); |
| 86 | + } |
| 87 | + }); |
| 88 | + |
| 89 | + // Compute average distributions excluding outliers |
| 90 | + final var averages = new HashMap<String, Double>(); |
| 91 | + featureDistribution.forEach((feature, stats) -> { |
| 92 | + |
| 93 | + final var lowerPercentile = stats.getPercentile(5); |
| 94 | + final var upperPercentile = stats.getPercentile(95); |
| 95 | + |
| 96 | + var sum = 0D; |
| 97 | + var count = 0; |
| 98 | + for (final double value : stats.getValues()) { |
| 99 | + if (value >= lowerPercentile && value <= upperPercentile) { |
| 100 | + sum += value; |
| 101 | + count++; |
| 102 | + } |
| 103 | + } |
| 104 | + |
| 105 | + final var meanWithoutOutliers = sum / count * 100; |
| 106 | + averages.put(feature, meanWithoutOutliers); |
| 107 | + }); |
| 108 | + |
| 109 | + return averages; |
| 110 | + } |
101 | 111 | } |
0 commit comments