Skip to content

Gagniuc/Malware-Scanner

Folders and files

NameName
Last commit message
Last commit date

Latest commit

 

History

25 Commits
 
 
 
 
 
 
 
 

Repository files navigation

Malware Scanner

This repository contains the complete supplementary package for the malware-scanner study in the Algorithms journal. The archive includes three Python implementations that reflect the evolution of the scanner across the stages described in the article. The Minimal Malware scanner.py file shows the core Aho-Corasick automaton with a direct construction path and a simple pattern-match procedure, and serves as a didactic reference for readers who wish to observe the essential behavior of the automaton with no auxiliary components. The Intermediary Malware scanner.py file introduces input checks, signature normalization, and clear state-transition output that exposes the internal mechanics of the automaton. The Complete Malware scanner.py file provides the full version of the scanner with robust input validation, consistent failure-output propagation, and diagnostic messages that trace valid and invalid patterns; this version reflects the model used in the article. The main.db file is the signature database required by all three implementations, with each entry expressed in the format name=hex_pattern.

The algorithms-18-00742.pdf file provides the published article for reference, and the screenshot.png file offers a visual example of the automaton structure shown in Appendix A. This repository thus provides a transparent and reproducible supplement that demonstrates the link between the theoretical structure of the Aho-Corasick automaton and its role in a practical malware-signature scanner.


Note: This project is part of a complementary toolkit.
You may also want to check out:

Native implementation in Python

import os
import re
import tkinter as t
from tkinter import filedialog as f, scrolledtext as s, ttk as k, messagebox as m
import time


# =========================
# Aho–Corasick structures
# =========================
class N:
    def __init__(a):
        a.c = {}
        a.o = []
        a.f = None


def b(p):
    r = N()
    for q in p:
        n = r
        for x in q:
            if x not in n.c:
                n.c[x] = N()
            n = n.c[x]
        n.o.append(q)
    return r


def a(y, r):
    z = []
    n = r
    for i, x in enumerate(y):
        while x not in n.c and n != r:
            n = n.f
        if x in n.c:
            n = n.c[x]
        for q in n.o:
            z.append((i - len(q) + 1, q))
    return z


def d(r):
    q = []
    for n in r.c.values():
        n.f = r
        q.append(n)
    while q:
        g = q.pop(0)
        for x, s in g.c.items():
            q.append(s)
            f_ = g.f
            while x not in f_.c and f_ != r:
                f_ = f_.f
            s.f = f_.c.get(x, r)
            s.o += s.f.o  # inherit outputs from failure links


# =========================
# Signature loading & validation
# =========================
def is_valid_token(tok):
    """Check if a token is a valid hex byte or wildcard."""
    return tok == '??' or re.fullmatch(r'[0-9A-Fa-f]{2}', tok) is not None


def parse_signature_line(line):
    """Parse a signature line, accepting hex, ?? and * wildcards."""
    if '=' not in line:
        return None
    name, raw = line.strip().split('=', 1)

    # Accept only allowed characters (0-9, A-F, ?, *)
    raw = raw.strip().upper().replace(' ', '')
    if not re.fullmatch(r'[0-9A-F\?\*]+', raw):
        return None

    # Replace wildcards with a neutral byte (00) to allow Aho–Corasick build
    cleaned = raw.replace('??', '00').replace('*', '')
    if len(cleaned) % 2 != 0:
        return None  # Not a valid even-length hex string
    return cleaned



def e(sf):
    """Load and validate .db signature file."""
    p = []
    skipped = 0
    try:
        if os.path.getsize(sf) == 0:
            raise ValueError("Empty signature file.")

        with open(sf, 'r', encoding='utf-8') as u:
            for v in u:
                if '=' in v:
                    parsed = parse_signature_line(v)
                    if parsed:
                        p.append(parsed)
                    else:
                        skipped += 1

        if not p:
            raise ValueError("No valid signatures loaded.")

        r = b(p)
        d(r)
        msg = f"Loaded {len(p)} valid signatures"
        if skipped:
            msg += f", skipped {skipped} invalid lines."
        m.showinfo("Signature File Loaded", msg)
        return r, len(p)
    except Exception as ex:
        m.showerror("Error", f"Failed to load signature file:\n{ex}")
        return None, 0


# =========================
# File scanning
# =========================
def h(fp, r, w=1024):
    """Scan a file using the constructed automaton."""
    try:
        with open(fp, 'rb') as u:
            if os.path.getsize(fp) < w:
                u.seek(0)
                v = u.read()
            else:
                v = u.read(w)
                u.seek(-w, os.SEEK_END)
                v += u.read(w)
            y = v.hex().upper()
            z = a(y, r)
            if z:
                f_, g_ = z[0]
                pos = f_ / (2 * len(v))
                return True, f"\nFile: {fp}\nMalicious! ^\nSignature: {g_}\nLocation:", pos
            return False, f"\nFile: {fp}\nClean file!\nLocation:", None
    except Exception as s:
        return False, f"\nFile: {fp}\nStatus: Error - {s}\n", None


# =========================
# GUI logic
# =========================
def j(pos, w=40):
    if pos is None:
        return "-" * w
    q = int(w * pos)
    return "-" * q + "+" + "-" * (w - q - 1)


def l():
    A = None
    B = 0

    C = t.Tk()
    C.title('Hex Signature Scanner (Gagniuc - Scut Antivirus)')

    def L():
        nonlocal A, B
        sf = f.askopenfilename(
            title="Select Signature File",
            filetypes=(("Database files", "*.db"), ("All files", "*.*"))
        )
        if sf:
            A, B = e(sf)
            if A:
                M.config(state=t.NORMAL)
                D.config(text=f"Total Signatures: {B}")

    def Q():
        nonlocal A
        if not A:
            m.showinfo("Information", "Load signature file first.")
            return

        x = {y.strip() for y in G.get().split(',')}
        if not x:
            m.showinfo("Information", "Specify file extensions to scan.")
            return

        y = f.askdirectory()
        if y:
            I.delete(1.0, t.END)
            F = []
            for R, _, S in os.walk(y):
                F.extend(
                    os.path.join(R, u)
                    for u in S
                    if any(u.lower().endswith(z.lower()) for z in x)
                )
            H['maximum'] = len(F)
            ic = 0
            cc = 0
            start = time.time()
            for fp in F:
                inf, res, pos = h(fp, A)
                gp = j(pos)
                I.insert(t.END, f"{res} [{gp}]\n")
                I.update_idletasks()
                H['value'] += 1
                H.update_idletasks()
                if inf:
                    ic += 1
                else:
                    cc += 1
            end = time.time()
            elapsed = end - start
            J.config(
                text=f"Scan Summary: {ic} Infected, {cc} Clean. Time Elapsed: {elapsed:.2f} sec"
            )

    K = t.Button(C, text="Load Signature", command=L)
    K.pack(pady=10)

    D = t.Label(C, text="Total Signatures: 0")
    D.pack(pady=10)

    G = t.Entry(C)
    G.pack(pady=10)
    G.insert(0, ".exe,.dll,.docx")

    M = t.Button(C,
                 text="Browse Folder and Scan",
                 command=Q,
                 state=t.DISABLED)
    M.pack(pady=10)

    H = k.Progressbar(C,
                      orient="horizontal",
                      length=300,
                      mode="determinate")
    H.pack(pady=10)

    J = t.Label(C, text="Scan summary: Not started")
    J.pack(pady=10)

    I = s.ScrolledText(C,
                       wrap=t.WORD,
                       width=80,
                       height=20)
    I.pack(pady=10, padx=10)

    C.mainloop()


if __name__ == "__main__":
    l()

# References
# Gagniuc, P.A.; Păvăloiu, I.B.; Dascălu, M.I. The Aho-Corasick Paradigm in Modern Antivirus 
# Engines:A Cornerstone of Signature-Based Malware Detection. Algorithms 2025, 18, 742.


Note: The repositories below showcase a few demo algorithmic implementations provided from the total of 127 algorithms presented in Antivirus Engines: From Methods to Innovations, Design, and Applications (Elsevier Syngress, 2024). You may find these standalone tools insightful:


References

  • Gagniuc, P.A.; Păvăloiu, I.B.; Dascălu, M.I. The Aho-Corasick Paradigm in Modern Antivirus Engines: A Cornerstone of Signature-Based Malware Detection. Algorithms 2025, 18, 742.
  • Paul A. Gagniuc. Antivirus Engines: From Methods to Innovations, Design, and Applications. Cambridge, MA: Elsevier Syngress, 2024. pp. 1-656.

About

This complete implementation represents a fully featured and fault-tolerant version of a malware scanner.

Topics

Resources

License

Stars

Watchers

Forks

Contributors

Languages