from graph import Graph
import matplotlib.pyplot as plt
import datetime
import os
import re


class Ratio(Graph):
    """Pie charts showing which countries / US states download the most.

    Counts are accumulated one log entry at a time via ``process`` and
    rendered to PNG files by ``draw``.
    """

    def __init__(self):
        super().__init__('ratio')
        self.data = {
            'monthly': {},  # { country: num } for the month this object was created in
            'total': {},    # { country: num }
            'us': {},       # { state: num }
            'notUs': {},    # { country: num }
        }
        # "Current month" is fixed at construction time; see process().
        self.cur = datetime.datetime.now()
        self.fmtRe = re.compile(r'0*e')  # remove trailing zeros in legend
        self.seen = set()

    def process(self, ip, time, request, size, location, log):
        """Tally one log entry into the per-country / per-state counters.

        Only ``time`` and ``location`` are read; the remaining parameters are
        accepted but unused here.

        Args:
            time: object exposing ``year`` and ``month``.
            location: geo lookup result, or a falsy value when the lookup
                failed. NOTE(review): the attribute access pattern
                (``country.name``, ``subdivisions[i].names``) looks like a
                geoip2 City response -- confirm against the caller.

        Returns:
            Always an empty list.
        """
        # A lookup can resolve but still carry no country name; fold that into
        # the same 'Unknown' bucket as a failed lookup rather than keying on None.
        con = (location.country.name if location else None) or 'Unknown'
        self.inc(self.data['total'], con)

        if time.year == self.cur.year and time.month == self.cur.month:
            self.inc(self.data['monthly'], con)

        if con == 'United States':
            if len(location.subdivisions) > 0:
                # state should always be the most general subdivision
                state = location.subdivisions[0].names['en']
            else:
                state = 'Unknown'
            self.inc(self.data['us'], state)
        else:
            self.inc(self.data['notUs'], con)

        return []

    def draw(self, path):
        """Render every ratio chart as a PNG under ``path``.

        The all-time charts are written directly into ``path``; the chart for
        the current month goes into ``path/monthly``.
        """
        print('Generating download ratios...')
        self.toPie(path, self.data['total'],
                   'Top Downloaders by Count',
                   'download_ratio_total.png')
        self.toPie(path, self.data['us'],
                   'Top Downloaders in the United States by Count',
                   'download_ratio_us_total.png',
                   colorCategory='state')
        self.toPie(path, self.data['notUs'],
                   'Top Downloaders Excluding the United States by Count',
                   'download_ratio_not_us_total.png',
                   bbox=(1.15, 1))

        # Monthly series are not displayed as a history, so only the current
        # month is generated. Each month gets its own file name, so earlier
        # months' images are never overwritten.
        monthlyPath = os.path.join(path, 'monthly')
        # exist_ok avoids the check-then-create race of isdir() + mkdir().
        os.makedirs(monthlyPath, exist_ok=True)
        title = f"Top Monthly Downloaders ({self.cur.strftime('%B')} {self.cur.year})"
        monthlyFile = f"download_ratio_monthly_{self.cur.strftime('%b').lower()}_{self.cur.year}.png"
        self.toPie(monthlyPath, self.data['monthly'], title, monthlyFile)

    def toPie(self, path, series, title, filename, bbox=(1, 1), colorCategory='country'):
        """Draw ``series`` as a pie chart and save it to ``path/filename``.

        Args:
            path: directory the image is written into.
            series: mapping of label -> count.
            title: chart title (also used in the "no data" message).
            filename: output file name.
            bbox: ``bbox_to_anchor`` for the legend.
            colorCategory: forwarded to ``self.getColors`` to pick the palette.

        Prints a message and returns without plotting when there is nothing
        to plot (empty series, or counts that sum to zero).
        """
        total = sum(series.values())
        # A zero total would divide by zero below; treat it like empty input.
        if not series or total == 0:
            print(f'Could not plot {title} due to no data')
            return

        # Keep the top 10 entries, stopping early once a share drops below
        # 0.01%; everything left over is coalesced into 'Other'.
        numPlot = 10
        minShare = 0.0001
        ranked = sorted(series.items(), key=lambda kv: kv[1], reverse=True)
        top = []
        for entry in ranked[:numPlot]:
            if entry[1] / total < minShare:
                break
            top.append(entry)
        if len(series) > len(top):
            top.append(('Other', sum(count for _, count in ranked[len(top):])))

        labels, values = zip(*top)
        fig, ax = plt.subplots(figsize=(12.8, 9.6))
        try:
            wedges, texts, autotexts = ax.pie(
                values,
                labels=labels,
                autopct='%1.1f%%',
                wedgeprops={'alpha': 0.75},
                colors=self.getColors(labels, colorCategory),
            )

            # do not display anything smaller than 2.5% on main graph
            for label, pct in zip(texts, autotexts):
                if float(pct.get_text().rstrip('%')) < 2.5:
                    label.set_text('')
                    pct.set_text('')

            # annotate legend with percent values
            # https://stackoverflow.com/questions/23577505/how-to-avoid-overlapping-of-labels-autopct-in-a-pie-chart
            # `top` is already in descending order with 'Other' appended last,
            # so the legend entries need no further sorting.
            legendLabels = [f'{name} ({round(100 * count / total, 2)}%)' for name, count in top]

            ax.set_title(title, fontsize=14)
            ax.legend(wedges, legendLabels, loc='upper left', fontsize=12, bbox_to_anchor=bbox)
            fig.savefig(os.path.join(path, filename), bbox_inches='tight', dpi=100)
        finally:
            # pyplot keeps every figure alive until it is closed explicitly;
            # without this each call leaks one figure.
            plt.close(fig)