Skip to content

Commit 3aa8700

Browse files
Brett ShirleyMsftBrettShirley
authored andcommitted
Miscellaneous diagnostic enhancements and corruption resiliency fixes.
Miscellaneous page corruption handing and diagnostic enhancements and other misc fixes. - Added logging Corruption failure item for redo replace of node with a too big or zero size. - Upgraded theses checks form AssertTrack to full HA Failure Item corruption throws ... o ErrGetPtr returns line.pv == NULL o ErrGetPtr returns a line with last byte below data-section of the page. o ErrGetPtr returns a line with first byte below data-section of the page. o ErrNDIGetKeydataflags returns a kdf with .data.Cb() == 0. o When ErrNDIGetKeydataflags returns an error. - Added and plumbed (via IPageValidationAction) a page validation reason to communicate to ErrCheckPage() and dump in ErrDumpToIrsRaw(). - Fixed retail AssertFail to correctly format varargs. Causes datacenter optics Code Enforcement report info to not have intended data / varargs. o Note: Seeing "GETLINEFAILED:%D" instead of "GETLINEFAILED:-359". - Restructured how we construct corruption details across ErrCheckPage, so details have a common prefix of page validation reason and pgno. - Commonalized how we format and report some issue outputs and sources in some paths. - Fixed some miscellaneous wrong output in eseutil /vss options. - Added or enhanced some totally miscellaneous and unrelated random comments. [Substrate:83c539a5d58c8741c06947a00ad6fbe8f658b4c9]
1 parent 1794c60 commit 3aa8700

19 files changed

Lines changed: 700 additions & 333 deletions

File tree

dev/ese/src/_res/jetmsg.mc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1469,7 +1469,7 @@ Additional information:%n
14691469
MessageId=540
14701470
SymbolicName=DB_DIVERGENCE_ID
14711471
Language=English
1472-
%1 (%2) %3Database %4: Page %5 in a B-Tree (ObjectId: %11) logical data checksum %6 failed to match logged scan check %7 checksum (seed %8) at log position %9.%n
1472+
%1 (%2) %3Database %4: Page %5 in a B-Tree (ObjectId: %11) logical data checksum %6 failed to match logged scan check %7 checksum (seed %8) at log position %9 / dbtime %14.%n
14731473
Additional information:%n
14741474
%tSource: %10%n
14751475
%tSourceObjidInvalid: %12%n

dev/ese/src/ese/_log/logredo.cxx

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3546,6 +3546,7 @@ ERR LOG::ErrLGRIRedoNodeOperation( const LRNODE_ *plrnode, ERR *perr )
35463546
// Assert( cb < sizeof( rgbRecNew ) );
35473547
if ( cb >= (SIZE_T)g_cbPage )
35483548
{
3549+
OSUHAEmitFailureTag( m_pinst, HaDbFailureTagCorruption, L"dba4c055-bbbf-4fd1-a56d-e786519803eb" );
35493550
Error( ErrERRCheck( JET_errLogCorrupted ) );
35503551
}
35513552

@@ -3560,6 +3561,7 @@ ERR LOG::ErrLGRIRedoNodeOperation( const LRNODE_ *plrnode, ERR *perr )
35603561
Assert( (ULONG)data.Cb() == cbNewData );
35613562
if ( (ULONG)data.Cb() != cbNewData )
35623563
{
3564+
OSUHAEmitFailureTag( m_pinst, HaDbFailureTagCorruption, L"a3cb57b9-8ba1-496d-a6fc-4fc2f0140fc4" );
35633565
Error( ErrERRCheck( JET_errLogCorrupted ) );
35643566
}
35653567

@@ -9913,6 +9915,7 @@ ERR LOG::ErrLGRIRedoScanCheck( const LRSCANCHECK2 * const plrscancheck, BOOL* co
99139915
OSFormatW( L"%u", objidPage ),
99149916
OSFormatW( L"%d", (INT)plrscancheck->FObjidInvalid() ),
99159917
OSFormatW( L"%d", (INT)plrscancheck->FEmptyPage() ),
9918+
OSFormatW( L"%I64d", dbtimePageInLogRec ),
99169919
};
99179920

99189921
UtilReportEvent(

dev/ese/src/ese/_osu/eventu.cxx

Lines changed: 15 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,19 @@ WCHAR WchReportInstState( const INST * const pinst )
2323
return '-'; // actually unknown!?
2424
}
2525

26+
CBasicProcessInfo::CBasicProcessInfo( const INST * const pinst )
27+
{
28+
#ifdef ESENT
29+
// I (SOMEONE) believe that DwUtilImageVersionMajor() and DwUtilImageBuildNumberMajor() reports false numbers for various
30+
// poorly thought out / invalid (*IMNHO*) reasons, would rather not report anything at all than bad / false numbers.
31+
OSStrCbFormatW( m_wszStateInfoEx, sizeof(m_wszStateInfoEx), L"%I32u,%wc,%I32u", DwUtilProcessId(), WchReportInstState( pinst ), OpTraceContext() );
32+
#else
33+
// Format of Bu.il.dNum.ber copied from FOSSysinfoPreinit() for Exchange.
34+
OSStrCbFormatW( m_wszStateInfoEx, sizeof(m_wszStateInfoEx), L"%I32u,%wc,%I32u,%02I32u.%02I32u.%04I32u.%03I32u", DwUtilProcessId(), WchReportInstState( pinst ), OpTraceContext(),
35+
DwUtilImageVersionMajor(), DwUtilImageVersionMinor(), DwUtilImageBuildNumberMajor(), DwUtilImageBuildNumberMinor() );
36+
#endif
37+
}
38+
2639
void UtilReportEvent(
2740
const EEventType type,
2841
const CategoryId catid,
@@ -62,17 +75,8 @@ void UtilReportEvent(
6275
rgpsz[ 0 ] = WszUtilProcessFriendlyName();
6376
}
6477

65-
WCHAR wszStateInfoEx[16 + 2 + 4 + 20]; // PID + Recovery/Undo/Do-time/Term mode + ApiOpCode + Bu.il.dNum.ber
66-
#ifdef ESENT
67-
// I (SOMEONE) believe that DwUtilImageVersionMajor() and DwUtilImageBuildNumberMajor() reports false numbers for various
68-
// poorly thought out / invalid (*IMNHO*) reasons, would rather not report anything at all than bad / false numbers.
69-
OSStrCbFormatW( wszStateInfoEx, sizeof(wszStateInfoEx), L"%I32u,%wc,%I32u", DwUtilProcessId(), WchReportInstState( pinst ), OpTraceContext() );
70-
#else
71-
// Format of Bu.il.dNum.ber copied from FOSSysinfoPreinit() for Exchange.
72-
OSStrCbFormatW( wszStateInfoEx, sizeof(wszStateInfoEx), L"%I32u,%wc,%I32u,%02I32u.%02I32u.%04I32u.%03I32u", DwUtilProcessId(), WchReportInstState( pinst ), OpTraceContext(),
73-
DwUtilImageVersionMajor(), DwUtilImageVersionMinor(), DwUtilImageBuildNumberMajor(), DwUtilImageBuildNumberMinor() );
74-
#endif
75-
rgpsz[1] = wszStateInfoEx;
78+
CBasicProcessInfo procinfo( pinst );
79+
rgpsz[1] = procinfo.Wsz();
7680

7781
WCHAR wszDisplayName[JET_cbFullNameMost + 3]; // 3 = sizof(": ") + 1
7882
/*

dev/ese/src/ese/bf.cxx

Lines changed: 58 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -18614,6 +18614,12 @@ ERR ErrBFIValidatePageSlowly( PBF pbf, const BFLatchType bflt, const CPageEvents
1861418614
BOOL fPageValidated = fFalse; // Doesn't mean page is valid, means that we fully validated the page to the best
1861518615
// of our abilities (which cannot always be performed under shared latch).
1861618616

18617+
PAGEValidationReason pgvr = pgvr::UserReadPageIo;
18618+
if ( tc.iorReason.Iort() == iortDbScan )
18619+
{
18620+
pgvr = pgvr::DbScanReadPageIo;
18621+
}
18622+
1861718623
while ( !fPageValidated )
1861818624
{
1861918625
ERR errFault = JET_errSuccess;
@@ -18693,7 +18699,7 @@ ERR ErrBFIValidatePageSlowly( PBF pbf, const BFLatchType bflt, const CPageEvents
1869318699

1869418700
if ( errValidate >= JET_errSuccess )
1869518701
{
18696-
errValidate = ErrBFIVerifyPage( pbf, cpe, fTrue );
18702+
errValidate = ErrBFIVerifyPage( pbf, pgvr, cpe, fTrue );
1869718703
}
1869818704
if ( errValidate < JET_errSuccess )
1869918705
{
@@ -18908,7 +18914,12 @@ ERR ErrBFIValidatePageSlowly( PBF pbf, const BFLatchType bflt, const CPageEvents
1890818914
// Important, fFalse controls event and if we update the page, the
1890918915
// page may or may not be updatable, so it is important
1891018916

18911-
errValidate = ErrBFIVerifyPage( pbf, CPageValidationLogEvent::LOG_NONE, fFalse );
18917+
// should not happen as main DbScan latches exclusively, but DbScan also comes in for schema lookups to see
18918+
// if table exists. we suppress the pgvr for this case, it is rare - but the value of knowing non-exclusive
18919+
// is more important
18920+
Expected( pgvr != pgvr::DbScanReadPageIo || tc.iorReason.Iors() == iorsBTSeek );
18921+
18922+
errValidate = ErrBFIVerifyPage( pbf, pgvr::UserReadPageIoNonExcl, CPageValidationLogEvent::LOG_NONE, fFalse );
1891218923
}
1891318924

1891418925
// if there is an error in the page then check the status of the page
@@ -19081,7 +19092,7 @@ void BFIPatchRequestIORange( PBF pbf, const CPageEvents cpe, const TraceContext&
1908119092
}
1908219093
}
1908319094

19084-
ERR ErrBFIVerifyPageSimplyWork( const PBF pbf )
19095+
ERR ErrBFIVerifyPageSimplyWork( const PBF pbf, const PAGEValidationReason pgvr )
1908519096
{
1908619097
BYTE rgBFLocal[sizeof(BF)];
1908719098
CPAGE::PGHDR2 pghdr2;
@@ -19093,7 +19104,7 @@ ERR ErrBFIVerifyPageSimplyWork( const PBF pbf )
1909319104
return JET_errSuccess;
1909419105
}
1909519106
memcpy( rgBFLocal, pbf, sizeof(BF) ); // 1. copy off the BF struct ...
19096-
CPageValidationNullAction nullaction;
19107+
CPageValidationNullAction nullaction( pgvr );
1909719108
CPAGE cpage;
1909819109
Assert( CbBFIBufferSize( pbf ) == CbBFIPageSize( pbf ) );
1909919110
cpage.LoadPage( pbf->ifmp, pbf->pgno, pbf->pv, CbBFIBufferSize( pbf ) );
@@ -19126,7 +19137,7 @@ ERR ErrBFIVerifyPageSimplyWork( const PBF pbf )
1912619137
return err;
1912719138
}
1912819139

19129-
ERR ErrBFIVerifyPage( const PBF pbf, const CPageEvents cpe, const BOOL fFixErrors )
19140+
ERR ErrBFIVerifyPage( const PBF pbf, const PAGEValidationReason pgvr, const CPageEvents cpe, const BOOL fFixErrors )
1913019141
{
1913119142
ERR err = JET_errSuccess;
1913219143
CPageEvents cpeActual = cpe;
@@ -19173,6 +19184,7 @@ ERR ErrBFIVerifyPage( const PBF pbf, const CPageEvents cpe, const BOOL fFixError
1917319184
}
1917419185

1917519186
CPageValidationLogEvent validationaction(
19187+
pgvr,
1917619188
pbf->ifmp,
1917719189
cpeActual,
1917819190
BUFFER_MANAGER_CATEGORY );
@@ -19565,6 +19577,7 @@ bool FBFIDestructiveSoftFaultPage( PBF pbf, _In_ const BOOL fNewPage )
1956519577
// validate the page with the Page Manager
1956619578

1956719579
const ERR errCheckPage = cpageCheck.ErrCheckPage( CPRINTFDBGOUT::PcprintfInstance(),
19580+
pbf->bfdf == bfdfClean ? pgvr::ReclaimCleanFromOsAvail : pgvr::ReclaimUntidyFromOsAvail,
1956819581
OnDebugOrRetail( ( fCorruptOnPageIn ? CPAGE::OnErrorReturnError : CPAGE::OnErrorFireWall ), CPAGE::OnErrorFireWall ),
1956919582
CPAGE::CheckLineBoundedByTag );
1957019583

@@ -19650,6 +19663,8 @@ void BFIReclaimPageFromOS(
1965019663
(DWORD)CbBFIBufferSize( pbf ) >= OSMemoryPageCommitGranularity() &&
1965119664
( pbf->bfdf < bfdfDirty ) );
1965219665

19666+
PAGEValidationReason pgvr = pgvr::Invalid;
19667+
1965319668
//
1965419669
// First we try to ratchet our latch type as high as possible to use the best reclaim mechanism we can support.
1965519670
//
@@ -19764,6 +19779,10 @@ void BFIReclaimPageFromOS(
1976419779
{
1976519780
// If we have the write latch, then we will be trying a destructive soft fault, and so
1976619781
// we must make sure the page is not updatable.
19782+
19783+
// FUTURE: We could probably make a small improvement to our fault handling by setting
19784+
// fAttemptDestructiveSoftFault = fFalse and continuing on. These not Updatable (recently
19785+
// written) pages are probably rare - as we just checksummed them before the Write IO.
1976719786
goto HandleError;
1976819787
}
1976919788

@@ -19833,6 +19852,7 @@ void BFIReclaimPageFromOS(
1983319852
memset( pbf->pv, 0, sizeof(CPAGE::PGHDR2) );
1983419853
}
1983519854

19855+
pgvr = pbf->bfdf == bfdfClean ? pgvr::ReclaimCleanFromOsAvail : pgvr::ReclaimUntidyFromOsAvail;
1983619856
}
1983719857
else
1983819858
#ifdef EXTRA_LATCHLESS_IO_CHECKS
@@ -19862,8 +19882,19 @@ void BFIReclaimPageFromOS(
1986219882
BFITrackCacheMissLatency( pbf, hrtFaultStart, bftcmrReasonPagingFaultOs, qos, tc, JET_errSuccess );
1986319883
}
1986419884

19885+
pgvr = pbf->bfdf < bfdfDirty ? pgvr::ReclaimCleanUntidyFromOs : pgvr::ReclaimDirtyFromOs;
19886+
1986519887
OSTrace( JET_tracetagBufferManager, OSFormat( "BF forced fault (soft or hard fault) from the OS ifmp:pgno %d:%d in %I64d ms", (ULONG)pbf->ifmp, pbf->pgno, cusecFaultTime / 1000 ) );
1986619888
}
19889+
#ifdef EXTRA_LATCHLESS_IO_CHECKS
19890+
else
19891+
{
19892+
pgvr = pbf->bfdf < bfdfDirty ? pgvr::ReclaimCleanUntidyFromOs : pgvr::ReclaimDirtyFromOs;
19893+
#ifndef DEBUG
19894+
AssertTrack( fFalse, "LatchlessIoChecksAreDebugOnly" ); // should be only debug path
19895+
#endif
19896+
}
19897+
#endif
1986719898

1986819899

1986919900
if ( bfltAchieved >= bfltExclusive && FBFIDatabasePage( pbf ) && pbf->err >= JET_errSuccess && !fNewPage )
@@ -19884,10 +19915,22 @@ void BFIReclaimPageFromOS(
1988419915
}
1988519916
#endif
1988619917

19887-
(void)cpageCheck.ErrCheckPage( CPRINTFDBGOUT::PcprintfInstance(),
19918+
const ERR errCheck = cpageCheck.ErrCheckPage( CPRINTFDBGOUT::PcprintfInstance(),
19919+
pgvr,
1988819920
CPAGE::OnErrorFireWall,
1988919921
CPAGE::CheckLineBoundedByTag );
1989019922
// void b/c we shouldn't return from the above call on failure.
19923+
if ( errCheck < JET_errSuccess )
19924+
{
19925+
switch( pgvr )
19926+
{
19927+
case pgvr::ReclaimCleanFromOsAvail: FireWall( "CorruptCleanPageReclaimedFromOSAvail" ); break;
19928+
case pgvr::ReclaimUntidyFromOsAvail: FireWall( "CorruptUntidyPageReclaimedFromOSAvail" ); break;
19929+
case pgvr::ReclaimCleanUntidyFromOs: FireWall( "CorruptCleanUntidyPageReclaimedFromOS" ); break;
19930+
case pgvr::ReclaimDirtyFromOs: FireWall( "CorruptDirtyPageReclaimedFromOS" ); break;
19931+
default: FireWall( "CorruptHuh" ); // all cases should be covered.
19932+
}
19933+
}
1989119934
}
1989219935
}
1989319936

@@ -22233,7 +22276,7 @@ bool CBFOpportuneWriter::FVerifyCleanBFs_()
2223322276
}
2223422277
}
2223522278

22236-
const ERR errReVerify = ErrBFIVerifyPage( pbf, CPageValidationLogEvent::LOG_NONE, fFalse );
22279+
const ERR errReVerify = ErrBFIVerifyPage( pbf, pgvr::WritePageIo, CPageValidationLogEvent::LOG_NONE, fFalse );
2223722280
if ( errPreVerify >= JET_errSuccess && errReVerify == JET_errPageNotInitialized )
2223822281
{
2223922282
// We can try and overwrite with the not initialized page, but it happens so
@@ -25380,7 +25423,7 @@ BOOL FBFICacheRemapPage( __inout PBF pbf, IFileAPI* const pfapi )
2538025423
Assert( CbBFIBufferSize( pbf ) == CbBFIPageSize( pbf ) );
2538125424
cpage.ReBufferPage( bfl, pbf->ifmp, pbf->pgno, pbf->pv, g_rgcbPageSize[pbf->icbPage] );
2538225425

25383-
CPageValidationLogEvent validationaction( pbf->ifmp, CPageValidationLogEvent::LOG_NONE, BUFFER_MANAGER_CATEGORY );
25426+
CPageValidationLogEvent validationaction( pgvr::RemapPageWriteIoComplete, pbf->ifmp, CPageValidationLogEvent::LOG_NONE, BUFFER_MANAGER_CATEGORY );
2538425427

2538525428
errCheckPage = cpage.ErrValidatePage( pgvfExtensiveChecks | pgvfDoNotCheckForLostFlush, &validationaction );
2538625429
// should we check flush map here?
@@ -25425,7 +25468,7 @@ BOOL FBFICacheRemapPage( __inout PBF pbf, IFileAPI* const pfapi )
2542525468
FBFIDatabasePage( pbf ) )
2542625469
{
2542725470
CPAGE cpage;
25428-
CPageValidationNullAction nullaction;
25471+
CPageValidationNullAction nullaction( pgvr::DebugCheck );
2542925472
cpage.LoadPage( pbf->ifmp, pbf->pgno, pbf->pv, g_rgcbPageSize[pbf->icbPage] );
2543025473
CallS( cpage.ErrValidatePage( pgvfDoNotCheckForLostFlush, &nullaction ) );
2543125474
}
@@ -25674,7 +25717,12 @@ void BFIFlushComplete( _Inout_ const PBF pbf, _In_ const BFLatchType bfltHave, _
2567425717
// for uninitialized pages because we could throw bogus lost flush errors.
2567525718
if ( !FBFIBufferIsZeroed( pbf ) )
2567625719
{
25677-
CallS( ErrBFIVerifyPageSimplyWork( pbf ) );
25720+
#ifdef DEBUG
25721+
// EXTRA_LATCHLESS_IO_CHECKS is only in debug, if someone decides to turn it on for
25722+
// retail, this will break and they'll give it a real reason.
25723+
PAGEValidationReason pgvr = pgvr::DebugCheck;
25724+
#endif
25725+
CallS( ErrBFIVerifyPageSimplyWork( pbf, pgvr ) );
2567825726
}
2567925727

2568025728
AssertRTL( CbBFIBufferSize( pbf ) == CbBFIPageSize( pbf ) );

0 commit comments

Comments
 (0)