77from collections import Counter
88from datetime import datetime
99from pathlib import Path
10- from typing import Any , Optional
10+ from typing import Any , Callable , Optional
1111from urllib .parse import parse_qs , quote , urlparse
1212
1313from fastapi import HTTPException
@@ -787,7 +787,112 @@ def _to_float(value: Any) -> Optional[float]:
787787 }
788788
789789
790- def _parse_system_message_content (raw_text : str ) -> str :
790+ def _extract_chatroom_top_message_metadata (raw_text : str ) -> dict [str , str ]:
791+ text = str (raw_text or "" ).strip ()
792+ if not text :
793+ return {}
794+
795+ lower_text = text .lower ()
796+ if "<mmchatroomtopmsg" in lower_text or "<sysmsg" in lower_text :
797+ chatroom_id = str (_extract_xml_tag_text (text , "chatroomname" ) or "" ).strip ()
798+ operation = str (_extract_xml_tag_text (text , "op" ) or "" ).strip ()
799+ operator_username = str (_extract_xml_tag_text (text , "username" ) or "" ).strip ()
800+ operator_display_name = str (_extract_xml_tag_text (text , "nickname" ) or "" ).strip ()
801+ if chatroom_id .endswith ("@chatroom" ) and operation in {"1" , "2" } and operator_username :
802+ return {
803+ "operation" : operation ,
804+ "operatorUsername" : operator_username ,
805+ "operatorDisplayName" : operator_display_name ,
806+ }
807+
808+ def _is_int_token (value : str ) -> bool :
809+ candidate = str (value or "" ).strip ()
810+ if not candidate :
811+ return False
812+ if candidate [0 ] in {"+" , "-" }:
813+ candidate = candidate [1 :]
814+ return candidate .isdigit ()
815+
816+ normalized = re .sub (r"<!--\s*ChatRoomTopMsgRequest\s*-->" , " " , text , flags = re .IGNORECASE )
817+ normalized = re .sub (r"<!--\s*ChatRoomTopMsgResponse\s*-->" , " " , normalized , flags = re .IGNORECASE )
818+ normalized = re .sub (r"\s+" , " " , normalized ).strip ()
819+ if not normalized :
820+ return {}
821+
822+ parts = normalized .split (" " )
823+ has_markers = ("chatroomtopmsgrequest" in lower_text ) or ("chatroomtopmsgresponse" in lower_text )
824+ if len (parts ) < 5 :
825+ return {}
826+
827+ chatroom_id = str (parts [0 ] or "" ).strip ()
828+ operation = str (parts [1 ] or "" ).strip ()
829+ if not chatroom_id .endswith ("@chatroom" ):
830+ return {}
831+ if operation not in {"1" , "2" }:
832+ return {}
833+
834+ if not has_markers :
835+ if len (parts ) < 6 :
836+ return {}
837+ if not _is_int_token (parts [2 ]) or not _is_int_token (parts [3 ]) or not _is_int_token (parts [5 ]):
838+ return {}
839+
840+ operator_username = str (parts [4 ] or "" ).strip ()
841+ if not operator_username :
842+ return {}
843+
844+ operator_display_name = ""
845+ if len (parts ) >= 6 and _is_int_token (parts [5 ]):
846+ response_tokens = parts [6 :]
847+ if len (response_tokens ) >= 2 and _is_int_token (response_tokens [- 1 ]):
848+ response_tokens = response_tokens [:- 1 ]
849+ operator_display_name = " " .join (response_tokens ).strip ()
850+
851+ return {
852+ "operation" : operation ,
853+ "operatorUsername" : operator_username ,
854+ "operatorDisplayName" : operator_display_name ,
855+ }
856+
857+
def _parse_chatroom_top_message(
    raw_text: str,
    resolve_display_name: Optional[Callable[[str, str], str]] = None,
) -> str:
    """Render a chatroom top-message payload as a one-line description.

    Args:
        raw_text: Raw message content, passed to
            ``_extract_chatroom_top_message_metadata``.
        resolve_display_name: Optional callback invoked as
            ``resolve_display_name(username, display_name)``. A non-empty
            result replaces the display name; an exception or empty result
            leaves it unchanged.

    Returns:
        The operator's name followed by the action text, or ``""`` when no
        metadata could be extracted or the operation code is unknown.
    """
    meta = _extract_chatroom_top_message_metadata(raw_text)
    if not meta:
        return ""

    def _field(key: str) -> str:
        # Normalise a metadata value to a stripped string.
        return str(meta.get(key) or "").strip()

    op_code = _field("operation")
    username = _field("operatorUsername")
    shown_name = _field("operatorDisplayName")

    if resolve_display_name is not None and username:
        try:
            looked_up = str(resolve_display_name(username, shown_name) or "").strip()
        except Exception:
            # Best effort: a failing resolver must not break message rendering.
            looked_up = ""
        shown_name = looked_up or shown_name

    # Fall back to the username, then to a generic placeholder.
    shown_name = shown_name or username or "有人"

    if op_code == "1":
        verb = "置顶了一条消息"
    elif op_code == "2":
        verb = "移除了一条置顶消息"
    else:
        return ""

    return f"{shown_name}{verb}"
890+
891+
892+ def _parse_system_message_content (
893+ raw_text : str ,
894+ resolve_display_name : Optional [Callable [[str , str ], str ]] = None ,
895+ ) -> str :
791896 text = str (raw_text or "" ).strip ()
792897 if not text :
793898 return "[系统消息]"
@@ -801,12 +906,17 @@ def _clean_system_text(value: str) -> str:
801906 if nested_content :
802907 candidate = nested_content
803908
909+ candidate = re .sub (r"<!--.*?-->" , " " , candidate , flags = re .IGNORECASE | re .DOTALL )
804910 candidate = re .sub (r"<!\[CDATA\[" , "" , candidate , flags = re .IGNORECASE )
805911 candidate = re .sub (r"\]\]>" , "" , candidate )
806912 candidate = re .sub (r"</?[_a-zA-Z0-9]+[^>]*>" , "" , candidate )
807913 candidate = re .sub (r"\s+" , " " , candidate ).strip ()
808914 return candidate
809915
916+ top_message_text = _parse_chatroom_top_message (text , resolve_display_name = resolve_display_name )
917+ if top_message_text :
918+ return top_message_text
919+
810920 if "revokemsg" in text .lower ():
811921 replace_msg = _extract_xml_tag_text (text , "replacemsg" )
812922 cleaned_replace_msg = _clean_system_text (replace_msg )
@@ -2334,4 +2444,5 @@ def _row_to_search_hit(
23342444 "locationLng" : location_lng ,
23352445 "locationPoiname" : location_poiname ,
23362446 "locationLabel" : location_label ,
2447+ "_rawText" : raw_text if local_type in (10000 , 266287972401 ) else "" ,
23372448 }
0 commit comments