@@ -402,6 +402,72 @@ class CaptionStyle:
402402}
403403
404404
def detect_keywords_nlp(
    words: List[Word],
    top_n: int = 15,
) -> Set[int]:
    """Detect important/keyword words using NLP-inspired frequency analysis.

    Uses a TF-IDF-like approach: words that are rare in general English but
    present in this transcript are likely important. Combines with POS-like
    heuristics (capitalized words, longer words = more important).

    No external dependencies — uses only stdlib.

    Args:
        words: Transcript words; only each word's ``text`` attribute is read.
        top_n: Maximum number of word indices to return.

    Returns:
        Indices into ``words`` of the highest-scoring tokens whose score
        exceeds the 0.3 relevance threshold (at most ``top_n`` of them).
    """
    if not words:
        return set()

    _PUNCT = ".,!?;:\"'()-"

    def _normalize(text: str) -> str:
        # Lowercased token with surrounding whitespace and punctuation removed.
        return text.strip().lower().strip(_PUNCT)

    # Normalize once and count token frequencies in a single O(n) pass.
    # (The previous version re-scanned the whole word list for every word,
    # making the rarity bonus O(n^2) on long transcripts.)
    normalized = [_normalize(w.text) for w in words]
    token_freq = Counter(normalized)

    # Common English stopwords to exclude (function words, fillers, and
    # high-frequency conversational verbs).
    _STOPWORDS = frozenset({
        "i", "me", "my", "we", "our", "you", "your", "he", "she", "it", "they",
        "them", "his", "her", "its", "this", "that", "these", "those", "is", "am",
        "are", "was", "were", "be", "been", "being", "have", "has", "had", "do",
        "does", "did", "will", "would", "shall", "should", "may", "might", "can",
        "could", "must", "a", "an", "the", "and", "but", "or", "if", "then",
        "so", "as", "of", "in", "on", "at", "to", "for", "with", "by", "from",
        "up", "out", "not", "no", "just", "very", "really", "also", "too",
        "about", "into", "over", "after", "before", "between", "through",
        "when", "where", "how", "what", "which", "who", "all", "each", "every",
        "both", "few", "more", "most", "other", "some", "such", "than", "only",
        "own", "same", "here", "there", "now", "once", "again",
        "going", "gonna", "like", "know", "think", "want", "need", "get", "got",
        "make", "take", "come", "go", "see", "look", "say", "said", "tell",
        "give", "let", "put", "well", "okay", "yeah", "yes", "right", "oh", "um",
        "uh", "ah", "because", "actually", "basically", "literally",
    })

    # Score each word; stopwords and very short tokens are assigned 0.0 so
    # they can never clear the relevance threshold below.
    scores = []
    for i, (w, clean) in enumerate(zip(words, normalized)):
        if not clean or clean in _STOPWORDS or len(clean) <= 2:
            scores.append((i, 0.0))
            continue

        score = 0.0
        # Length bonus (longer words tend to be more meaningful); capped at 8+ chars.
        score += min(len(clean) / 8.0, 1.0) * 0.3
        # Capitalization bonus (proper nouns, emphasis).
        stripped = w.text.strip()
        if stripped and stripped[0].isupper():
            score += 0.2
        # Already in the curated action-keyword list = strong signal.
        if clean in _ACTION_KEYWORDS:
            score += 0.5
        # Digits often mark stats, years, or amounts.
        if any(c.isdigit() for c in clean):
            score += 0.3
        # Rarity bonus: tokens appearing at most twice in the transcript.
        if token_freq[clean] <= 2:
            score += 0.2

        scores.append((i, score))

    # Keep the top_n highest-scoring indices above the relevance threshold.
    scores.sort(key=lambda x: x[1], reverse=True)
    return {idx for idx, score in scores[:top_n] if score > 0.3}
469+
470+
405471def detect_action_words_by_energy (
406472 filepath : str ,
407473 words : List [Word ],
@@ -451,11 +517,17 @@ def get_action_word_indices(
451517 all_words : List [Word ],
452518 custom_words : Optional [List [str ]] = None ,
453519 use_keywords : bool = True ,
520+ use_nlp : bool = True ,
454521 energy_indices : Optional [Set [int ]] = None ,
455522) -> Set [int ]:
456- """Combine keyword list, custom words, and energy analysis for action words."""
523+ """Combine keyword list, NLP analysis, custom words, and energy analysis for action words."""
457524 result = set ()
458525
526+ # NLP-based keyword detection (frequency/importance analysis)
527+ if use_nlp and all_words :
528+ nlp_indices = detect_keywords_nlp (all_words )
529+ result .update (nlp_indices )
530+
459531 keywords = set ()
460532 if use_keywords :
461533 keywords .update (_ACTION_KEYWORDS )
0 commit comments