This repository contains the complete supplementary package for the malware-scanner study in the Algorithms journal. The archive includes three Python implementations that reflect the evolution of the scanner across the stages described in the article. The Minimal Malware scanner.py file shows the core Aho-Corasick automaton with a direct construction path and a simple pattern-match procedure, and serves as a didactic reference for readers who wish to observe the essential behavior of the automaton with no auxiliary components. The Intermediary Malware scanner.py file introduces input checks, signature normalization, and clear state-transition output that exposes the internal mechanics of the automaton. The Complete Malware scanner.py file provides the full version of the scanner with robust input validation, consistent failure-output propagation, and diagnostic messages that trace valid and invalid patterns; this version reflects the model used in the article. The main.db file is the signature database required by all three implementations, with each entry expressed in the format name=hex_pattern.
The algorithms-18-00742.pdf file provides the published article for reference, and the screenshot.png file offers a visual example of the automaton structure shown in Appendix A. This repository thus provides a transparent and reproducible supplement that demonstrates the link between the theoretical structure of the Aho-Corasick automaton and its role in a practical malware-signature scanner.
Note: This project is part of a complementary toolkit.
You may also want to check out:
import os
import re
import tkinter as t
from tkinter import filedialog as f, scrolledtext as s, ttk as k, messagebox as m
import time
# =========================
# Aho–Corasick structures
# =========================
class N:
def __init__(a):
a.c = {}
a.o = []
a.f = None
def b(p):
r = N()
for q in p:
n = r
for x in q:
if x not in n.c:
n.c[x] = N()
n = n.c[x]
n.o.append(q)
return r
def a(y, r):
z = []
n = r
for i, x in enumerate(y):
while x not in n.c and n != r:
n = n.f
if x in n.c:
n = n.c[x]
for q in n.o:
z.append((i - len(q) + 1, q))
return z
def d(r):
q = []
for n in r.c.values():
n.f = r
q.append(n)
while q:
g = q.pop(0)
for x, s in g.c.items():
q.append(s)
f_ = g.f
while x not in f_.c and f_ != r:
f_ = f_.f
s.f = f_.c.get(x, r)
s.o += s.f.o # inherit outputs from failure links
# =========================
# Signature loading & validation
# =========================
def is_valid_token(tok):
"""Check if a token is a valid hex byte or wildcard."""
return tok == '??' or re.fullmatch(r'[0-9A-Fa-f]{2}', tok) is not None
def parse_signature_line(line):
"""Parse a signature line, accepting hex, ?? and * wildcards."""
if '=' not in line:
return None
name, raw = line.strip().split('=', 1)
# Accept only allowed characters (0-9, A-F, ?, *)
raw = raw.strip().upper().replace(' ', '')
if not re.fullmatch(r'[0-9A-F\?\*]+', raw):
return None
# Replace wildcards with a neutral byte (00) to allow Aho–Corasick build
cleaned = raw.replace('??', '00').replace('*', '')
if len(cleaned) % 2 != 0:
return None # Not a valid even-length hex string
return cleaned
def e(sf):
"""Load and validate .db signature file."""
p = []
skipped = 0
try:
if os.path.getsize(sf) == 0:
raise ValueError("Empty signature file.")
with open(sf, 'r', encoding='utf-8') as u:
for v in u:
if '=' in v:
parsed = parse_signature_line(v)
if parsed:
p.append(parsed)
else:
skipped += 1
if not p:
raise ValueError("No valid signatures loaded.")
r = b(p)
d(r)
msg = f"Loaded {len(p)} valid signatures"
if skipped:
msg += f", skipped {skipped} invalid lines."
m.showinfo("Signature File Loaded", msg)
return r, len(p)
except Exception as ex:
m.showerror("Error", f"Failed to load signature file:\n{ex}")
return None, 0
# =========================
# File scanning
# =========================
def h(fp, r, w=1024):
"""Scan a file using the constructed automaton."""
try:
with open(fp, 'rb') as u:
if os.path.getsize(fp) < w:
u.seek(0)
v = u.read()
else:
v = u.read(w)
u.seek(-w, os.SEEK_END)
v += u.read(w)
y = v.hex().upper()
z = a(y, r)
if z:
f_, g_ = z[0]
pos = f_ / (2 * len(v))
return True, f"\nFile: {fp}\nMalicious! ^\nSignature: {g_}\nLocation:", pos
return False, f"\nFile: {fp}\nClean file!\nLocation:", None
except Exception as s:
return False, f"\nFile: {fp}\nStatus: Error - {s}\n", None
# =========================
# GUI logic
# =========================
def j(pos, w=40):
if pos is None:
return "-" * w
q = int(w * pos)
return "-" * q + "+" + "-" * (w - q - 1)
def l():
A = None
B = 0
C = t.Tk()
C.title('Hex Signature Scanner (Gagniuc - Scut Antivirus)')
def L():
nonlocal A, B
sf = f.askopenfilename(
title="Select Signature File",
filetypes=(("Database files", "*.db"), ("All files", "*.*"))
)
if sf:
A, B = e(sf)
if A:
M.config(state=t.NORMAL)
D.config(text=f"Total Signatures: {B}")
def Q():
nonlocal A
if not A:
m.showinfo("Information", "Load signature file first.")
return
x = {y.strip() for y in G.get().split(',')}
if not x:
m.showinfo("Information", "Specify file extensions to scan.")
return
y = f.askdirectory()
if y:
I.delete(1.0, t.END)
F = []
for R, _, S in os.walk(y):
F.extend(
os.path.join(R, u)
for u in S
if any(u.lower().endswith(z.lower()) for z in x)
)
H['maximum'] = len(F)
ic = 0
cc = 0
start = time.time()
for fp in F:
inf, res, pos = h(fp, A)
gp = j(pos)
I.insert(t.END, f"{res} [{gp}]\n")
I.update_idletasks()
H['value'] += 1
H.update_idletasks()
if inf:
ic += 1
else:
cc += 1
end = time.time()
elapsed = end - start
J.config(
text=f"Scan Summary: {ic} Infected, {cc} Clean. Time Elapsed: {elapsed:.2f} sec"
)
K = t.Button(C, text="Load Signature", command=L)
K.pack(pady=10)
D = t.Label(C, text="Total Signatures: 0")
D.pack(pady=10)
G = t.Entry(C)
G.pack(pady=10)
G.insert(0, ".exe,.dll,.docx")
M = t.Button(C,
text="Browse Folder and Scan",
command=Q,
state=t.DISABLED)
M.pack(pady=10)
H = k.Progressbar(C,
orient="horizontal",
length=300,
mode="determinate")
H.pack(pady=10)
J = t.Label(C, text="Scan summary: Not started")
J.pack(pady=10)
I = s.ScrolledText(C,
wrap=t.WORD,
width=80,
height=20)
I.pack(pady=10, padx=10)
C.mainloop()
if __name__ == "__main__":
l()
# References
# Gagniuc, P.A.; Păvăloiu, I.B.; Dascălu, M.I. The Aho-Corasick Paradigm in Modern Antivirus
# Engines:A Cornerstone of Signature-Based Malware Detection. Algorithms 2025, 18, 742.Note: The repositories below showcase a few demo algorithmic implementations provided from the total of 127 algorithms presented in Antivirus Engines: From Methods to Innovations, Design, and Applications (Elsevier Syngress, 2024). You may find these standalone tools insightful:
- Gagniuc, P.A.; Păvăloiu, I.B.; Dascălu, M.I. The Aho-Corasick Paradigm in Modern Antivirus Engines: A Cornerstone of Signature-Based Malware Detection. Algorithms 2025, 18, 742.
- Paul A. Gagniuc. Antivirus Engines: From Methods to Innovations, Design, and Applications. Cambridge, MA: Elsevier Syngress, 2024. pp. 1-656.


