000001 /* 000002 ** 2004 April 6 000003 ** 000004 ** The author disclaims copyright to this source code. In place of 000005 ** a legal notice, here is a blessing: 000006 ** 000007 ** May you do good and not evil. 000008 ** May you find forgiveness for yourself and forgive others. 000009 ** May you share freely, never taking more than you give. 000010 ** 000011 ************************************************************************* 000012 ** This file implements an external (disk-based) database using BTrees. 000013 ** See the header comment on "btreeInt.h" for additional information. 000014 ** Including a description of file format and an overview of operation. 000015 */ 000016 #include "btreeInt.h" 000017 000018 /* 000019 ** The header string that appears at the beginning of every 000020 ** SQLite database. 000021 */ 000022 static const char zMagicHeader[] = SQLITE_FILE_HEADER; 000023 000024 /* 000025 ** Set this global variable to 1 to enable tracing using the TRACE 000026 ** macro. 000027 */ 000028 #if 0 000029 int sqlite3BtreeTrace=1; /* True to enable tracing */ 000030 # define TRACE(X) if(sqlite3BtreeTrace){printf X;fflush(stdout);} 000031 #else 000032 # define TRACE(X) 000033 #endif 000034 000035 /* 000036 ** Extract a 2-byte big-endian integer from an array of unsigned bytes. 000037 ** But if the value is zero, make it 65536. 000038 ** 000039 ** This routine is used to extract the "offset to cell content area" value 000040 ** from the header of a btree page. If the page size is 65536 and the page 000041 ** is empty, the offset should be 65536, but the 2-byte value stores zero. 000042 ** This routine makes the necessary adjustment to 65536. 
000043 */ 000044 #define get2byteNotZero(X) (((((int)get2byte(X))-1)&0xffff)+1) 000045 000046 /* 000047 ** Values passed as the 5th argument to allocateBtreePage() 000048 */ 000049 #define BTALLOC_ANY 0 /* Allocate any page */ 000050 #define BTALLOC_EXACT 1 /* Allocate exact page if possible */ 000051 #define BTALLOC_LE 2 /* Allocate any page <= the parameter */ 000052 000053 /* 000054 ** Macro IfNotOmitAV(x) returns (x) if SQLITE_OMIT_AUTOVACUUM is not 000055 ** defined, or 0 if it is. For example: 000056 ** 000057 ** bIncrVacuum = IfNotOmitAV(pBtShared->incrVacuum); 000058 */ 000059 #ifndef SQLITE_OMIT_AUTOVACUUM 000060 #define IfNotOmitAV(expr) (expr) 000061 #else 000062 #define IfNotOmitAV(expr) 0 000063 #endif 000064 000065 #ifndef SQLITE_OMIT_SHARED_CACHE 000066 /* 000067 ** A list of BtShared objects that are eligible for participation 000068 ** in shared cache. This variable has file scope during normal builds, 000069 ** but the test harness needs to access it so we make it global for 000070 ** test builds. 000071 ** 000072 ** Access to this variable is protected by SQLITE_MUTEX_STATIC_MAIN. 000073 */ 000074 #ifdef SQLITE_TEST 000075 BtShared *SQLITE_WSD sqlite3SharedCacheList = 0; 000076 #else 000077 static BtShared *SQLITE_WSD sqlite3SharedCacheList = 0; 000078 #endif 000079 #endif /* SQLITE_OMIT_SHARED_CACHE */ 000080 000081 #ifndef SQLITE_OMIT_SHARED_CACHE 000082 /* 000083 ** Enable or disable the shared pager and schema features. 000084 ** 000085 ** This routine has no effect on existing database connections. 000086 ** The shared cache setting effects only future calls to 000087 ** sqlite3_open(), sqlite3_open16(), or sqlite3_open_v2(). 
000088 */ 000089 int sqlite3_enable_shared_cache(int enable){ 000090 sqlite3GlobalConfig.sharedCacheEnabled = enable; 000091 return SQLITE_OK; 000092 } 000093 #endif 000094 000095 000096 000097 #ifdef SQLITE_OMIT_SHARED_CACHE 000098 /* 000099 ** The functions querySharedCacheTableLock(), setSharedCacheTableLock(), 000100 ** and clearAllSharedCacheTableLocks() 000101 ** manipulate entries in the BtShared.pLock linked list used to store 000102 ** shared-cache table level locks. If the library is compiled with the 000103 ** shared-cache feature disabled, then there is only ever one user 000104 ** of each BtShared structure and so this locking is not necessary. 000105 ** So define the lock related functions as no-ops. 000106 */ 000107 #define querySharedCacheTableLock(a,b,c) SQLITE_OK 000108 #define setSharedCacheTableLock(a,b,c) SQLITE_OK 000109 #define clearAllSharedCacheTableLocks(a) 000110 #define downgradeAllSharedCacheTableLocks(a) 000111 #define hasSharedCacheTableLock(a,b,c,d) 1 000112 #define hasReadConflicts(a, b) 0 000113 #endif 000114 000115 #ifdef SQLITE_DEBUG 000116 /* 000117 ** Return and reset the seek counter for a Btree object. 000118 */ 000119 sqlite3_uint64 sqlite3BtreeSeekCount(Btree *pBt){ 000120 u64 n = pBt->nSeek; 000121 pBt->nSeek = 0; 000122 return n; 000123 } 000124 #endif 000125 000126 /* 000127 ** Implementation of the SQLITE_CORRUPT_PAGE() macro. Takes a single 000128 ** (MemPage*) as an argument. The (MemPage*) must not be NULL. 000129 ** 000130 ** If SQLITE_DEBUG is not defined, then this macro is equivalent to 000131 ** SQLITE_CORRUPT_BKPT. Or, if SQLITE_DEBUG is set, then the log message 000132 ** normally produced as a side-effect of SQLITE_CORRUPT_BKPT is augmented 000133 ** with the page number and filename associated with the (MemPage*). 
000134 */ 000135 #ifdef SQLITE_DEBUG 000136 int corruptPageError(int lineno, MemPage *p){ 000137 char *zMsg; 000138 sqlite3BeginBenignMalloc(); 000139 zMsg = sqlite3_mprintf("database corruption page %u of %s", 000140 p->pgno, sqlite3PagerFilename(p->pBt->pPager, 0) 000141 ); 000142 sqlite3EndBenignMalloc(); 000143 if( zMsg ){ 000144 sqlite3ReportError(SQLITE_CORRUPT, lineno, zMsg); 000145 } 000146 sqlite3_free(zMsg); 000147 return SQLITE_CORRUPT_BKPT; 000148 } 000149 # define SQLITE_CORRUPT_PAGE(pMemPage) corruptPageError(__LINE__, pMemPage) 000150 #else 000151 # define SQLITE_CORRUPT_PAGE(pMemPage) SQLITE_CORRUPT_PGNO(pMemPage->pgno) 000152 #endif 000153 000154 /* Default value for SHARED_LOCK_TRACE macro if shared-cache is disabled 000155 ** or if the lock tracking is disabled. This is always the value for 000156 ** release builds. 000157 */ 000158 #define SHARED_LOCK_TRACE(X,MSG,TAB,TYPE) /*no-op*/ 000159 000160 #ifndef SQLITE_OMIT_SHARED_CACHE 000161 000162 #if 0 000163 /* ^---- Change to 1 and recompile to enable shared-lock tracing 000164 ** for debugging purposes. 000165 ** 000166 ** Print all shared-cache locks on a BtShared. Debugging use only. 000167 */ 000168 static void sharedLockTrace( 000169 BtShared *pBt, 000170 const char *zMsg, 000171 int iRoot, 000172 int eLockType 000173 ){ 000174 BtLock *pLock; 000175 if( iRoot>0 ){ 000176 printf("%s-%p %u%s:", zMsg, pBt, iRoot, eLockType==READ_LOCK?"R":"W"); 000177 }else{ 000178 printf("%s-%p:", zMsg, pBt); 000179 } 000180 for(pLock=pBt->pLock; pLock; pLock=pLock->pNext){ 000181 printf(" %p/%u%s", pLock->pBtree, pLock->iTable, 000182 pLock->eLock==READ_LOCK ? "R" : "W"); 000183 while( pLock->pNext && pLock->pBtree==pLock->pNext->pBtree ){ 000184 pLock = pLock->pNext; 000185 printf(",%u%s", pLock->iTable, pLock->eLock==READ_LOCK ? 
"R" : "W"); 000186 } 000187 } 000188 printf("\n"); 000189 fflush(stdout); 000190 } 000191 #undef SHARED_LOCK_TRACE 000192 #define SHARED_LOCK_TRACE(X,MSG,TAB,TYPE) sharedLockTrace(X,MSG,TAB,TYPE) 000193 #endif /* Shared-lock tracing */ 000194 000195 #ifdef SQLITE_DEBUG 000196 /* 000197 **** This function is only used as part of an assert() statement. *** 000198 ** 000199 ** Check to see if pBtree holds the required locks to read or write to the 000200 ** table with root page iRoot. Return 1 if it does and 0 if not. 000201 ** 000202 ** For example, when writing to a table with root-page iRoot via 000203 ** Btree connection pBtree: 000204 ** 000205 ** assert( hasSharedCacheTableLock(pBtree, iRoot, 0, WRITE_LOCK) ); 000206 ** 000207 ** When writing to an index that resides in a sharable database, the 000208 ** caller should have first obtained a lock specifying the root page of 000209 ** the corresponding table. This makes things a bit more complicated, 000210 ** as this module treats each table as a separate structure. To determine 000211 ** the table corresponding to the index being written, this 000212 ** function has to search through the database schema. 000213 ** 000214 ** Instead of a lock on the table/index rooted at page iRoot, the caller may 000215 ** hold a write-lock on the schema table (root page 1). This is also 000216 ** acceptable. 000217 */ 000218 static int hasSharedCacheTableLock( 000219 Btree *pBtree, /* Handle that must hold lock */ 000220 Pgno iRoot, /* Root page of b-tree */ 000221 int isIndex, /* True if iRoot is the root of an index b-tree */ 000222 int eLockType /* Required lock type (READ_LOCK or WRITE_LOCK) */ 000223 ){ 000224 Schema *pSchema = (Schema *)pBtree->pBt->pSchema; 000225 Pgno iTab = 0; 000226 BtLock *pLock; 000227 000228 /* If this database is not shareable, or if the client is reading 000229 ** and has the read-uncommitted flag set, then no lock is required. 000230 ** Return true immediately. 
000231 */ 000232 if( (pBtree->sharable==0) 000233 || (eLockType==READ_LOCK && (pBtree->db->flags & SQLITE_ReadUncommit)) 000234 ){ 000235 return 1; 000236 } 000237 000238 /* If the client is reading or writing an index and the schema is 000239 ** not loaded, then it is too difficult to actually check to see if 000240 ** the correct locks are held. So do not bother - just return true. 000241 ** This case does not come up very often anyhow. 000242 */ 000243 if( isIndex && (!pSchema || (pSchema->schemaFlags&DB_SchemaLoaded)==0) ){ 000244 return 1; 000245 } 000246 000247 /* Figure out the root-page that the lock should be held on. For table 000248 ** b-trees, this is just the root page of the b-tree being read or 000249 ** written. For index b-trees, it is the root page of the associated 000250 ** table. */ 000251 if( isIndex ){ 000252 HashElem *p; 000253 int bSeen = 0; 000254 for(p=sqliteHashFirst(&pSchema->idxHash); p; p=sqliteHashNext(p)){ 000255 Index *pIdx = (Index *)sqliteHashData(p); 000256 if( pIdx->tnum==iRoot ){ 000257 if( bSeen ){ 000258 /* Two or more indexes share the same root page. There must 000259 ** be imposter tables. So just return true. The assert is not 000260 ** useful in that case. */ 000261 return 1; 000262 } 000263 iTab = pIdx->pTable->tnum; 000264 bSeen = 1; 000265 } 000266 } 000267 }else{ 000268 iTab = iRoot; 000269 } 000270 000271 SHARED_LOCK_TRACE(pBtree->pBt,"hasLock",iRoot,eLockType); 000272 000273 /* Search for the required lock. Either a write-lock on root-page iTab, a 000274 ** write-lock on the schema table, or (if the client is reading) a 000275 ** read-lock on iTab will suffice. Return 1 if any of these are found. */ 000276 for(pLock=pBtree->pBt->pLock; pLock; pLock=pLock->pNext){ 000277 if( pLock->pBtree==pBtree 000278 && (pLock->iTable==iTab || (pLock->eLock==WRITE_LOCK && pLock->iTable==1)) 000279 && pLock->eLock>=eLockType 000280 ){ 000281 return 1; 000282 } 000283 } 000284 000285 /* Failed to find the required lock. 
*/ 000286 return 0; 000287 } 000288 #endif /* SQLITE_DEBUG */ 000289 000290 #ifdef SQLITE_DEBUG 000291 /* 000292 **** This function may be used as part of assert() statements only. **** 000293 ** 000294 ** Return true if it would be illegal for pBtree to write into the 000295 ** table or index rooted at iRoot because other shared connections are 000296 ** simultaneously reading that same table or index. 000297 ** 000298 ** It is illegal for pBtree to write if some other Btree object that 000299 ** shares the same BtShared object is currently reading or writing 000300 ** the iRoot table. Except, if the other Btree object has the 000301 ** read-uncommitted flag set, then it is OK for the other object to 000302 ** have a read cursor. 000303 ** 000304 ** For example, before writing to any part of the table or index 000305 ** rooted at page iRoot, one should call: 000306 ** 000307 ** assert( !hasReadConflicts(pBtree, iRoot) ); 000308 */ 000309 static int hasReadConflicts(Btree *pBtree, Pgno iRoot){ 000310 BtCursor *p; 000311 for(p=pBtree->pBt->pCursor; p; p=p->pNext){ 000312 if( p->pgnoRoot==iRoot 000313 && p->pBtree!=pBtree 000314 && 0==(p->pBtree->db->flags & SQLITE_ReadUncommit) 000315 ){ 000316 return 1; 000317 } 000318 } 000319 return 0; 000320 } 000321 #endif /* #ifdef SQLITE_DEBUG */ 000322 000323 /* 000324 ** Query to see if Btree handle p may obtain a lock of type eLock 000325 ** (READ_LOCK or WRITE_LOCK) on the table with root-page iTab. Return 000326 ** SQLITE_OK if the lock may be obtained (by calling 000327 ** setSharedCacheTableLock()), or SQLITE_LOCKED if not. 
000328 */ 000329 static int querySharedCacheTableLock(Btree *p, Pgno iTab, u8 eLock){ 000330 BtShared *pBt = p->pBt; 000331 BtLock *pIter; 000332 000333 assert( sqlite3BtreeHoldsMutex(p) ); 000334 assert( eLock==READ_LOCK || eLock==WRITE_LOCK ); 000335 assert( p->db!=0 ); 000336 assert( !(p->db->flags&SQLITE_ReadUncommit)||eLock==WRITE_LOCK||iTab==1 ); 000337 000338 /* If requesting a write-lock, then the Btree must have an open write 000339 ** transaction on this file. And, obviously, for this to be so there 000340 ** must be an open write transaction on the file itself. 000341 */ 000342 assert( eLock==READ_LOCK || (p==pBt->pWriter && p->inTrans==TRANS_WRITE) ); 000343 assert( eLock==READ_LOCK || pBt->inTransaction==TRANS_WRITE ); 000344 000345 /* This routine is a no-op if the shared-cache is not enabled */ 000346 if( !p->sharable ){ 000347 return SQLITE_OK; 000348 } 000349 000350 /* If some other connection is holding an exclusive lock, the 000351 ** requested lock may not be obtained. 000352 */ 000353 if( pBt->pWriter!=p && (pBt->btsFlags & BTS_EXCLUSIVE)!=0 ){ 000354 sqlite3ConnectionBlocked(p->db, pBt->pWriter->db); 000355 return SQLITE_LOCKED_SHAREDCACHE; 000356 } 000357 000358 for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){ 000359 /* The condition (pIter->eLock!=eLock) in the following if(...) 000360 ** statement is a simplification of: 000361 ** 000362 ** (eLock==WRITE_LOCK || pIter->eLock==WRITE_LOCK) 000363 ** 000364 ** since we know that if eLock==WRITE_LOCK, then no other connection 000365 ** may hold a WRITE_LOCK on any table in this file (since there can 000366 ** only be a single writer). 
000367 */ 000368 assert( pIter->eLock==READ_LOCK || pIter->eLock==WRITE_LOCK ); 000369 assert( eLock==READ_LOCK || pIter->pBtree==p || pIter->eLock==READ_LOCK); 000370 if( pIter->pBtree!=p && pIter->iTable==iTab && pIter->eLock!=eLock ){ 000371 sqlite3ConnectionBlocked(p->db, pIter->pBtree->db); 000372 if( eLock==WRITE_LOCK ){ 000373 assert( p==pBt->pWriter ); 000374 pBt->btsFlags |= BTS_PENDING; 000375 } 000376 return SQLITE_LOCKED_SHAREDCACHE; 000377 } 000378 } 000379 return SQLITE_OK; 000380 } 000381 #endif /* !SQLITE_OMIT_SHARED_CACHE */ 000382 000383 #ifndef SQLITE_OMIT_SHARED_CACHE 000384 /* 000385 ** Add a lock on the table with root-page iTable to the shared-btree used 000386 ** by Btree handle p. Parameter eLock must be either READ_LOCK or 000387 ** WRITE_LOCK. 000388 ** 000389 ** This function assumes the following: 000390 ** 000391 ** (a) The specified Btree object p is connected to a sharable 000392 ** database (one with the BtShared.sharable flag set), and 000393 ** 000394 ** (b) No other Btree objects hold a lock that conflicts 000395 ** with the requested lock (i.e. querySharedCacheTableLock() has 000396 ** already been called and returned SQLITE_OK). 000397 ** 000398 ** SQLITE_OK is returned if the lock is added successfully. SQLITE_NOMEM 000399 ** is returned if a malloc attempt fails. 000400 */ 000401 static int setSharedCacheTableLock(Btree *p, Pgno iTable, u8 eLock){ 000402 BtShared *pBt = p->pBt; 000403 BtLock *pLock = 0; 000404 BtLock *pIter; 000405 000406 SHARED_LOCK_TRACE(pBt,"setLock", iTable, eLock); 000407 000408 assert( sqlite3BtreeHoldsMutex(p) ); 000409 assert( eLock==READ_LOCK || eLock==WRITE_LOCK ); 000410 assert( p->db!=0 ); 000411 000412 /* A connection with the read-uncommitted flag set will never try to 000413 ** obtain a read-lock using this function. The only read-lock obtained 000414 ** by a connection in read-uncommitted mode is on the sqlite_schema 000415 ** table, and that lock is obtained in BtreeBeginTrans(). 
*/ 000416 assert( 0==(p->db->flags&SQLITE_ReadUncommit) || eLock==WRITE_LOCK ); 000417 000418 /* This function should only be called on a sharable b-tree after it 000419 ** has been determined that no other b-tree holds a conflicting lock. */ 000420 assert( p->sharable ); 000421 assert( SQLITE_OK==querySharedCacheTableLock(p, iTable, eLock) ); 000422 000423 /* First search the list for an existing lock on this table. */ 000424 for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){ 000425 if( pIter->iTable==iTable && pIter->pBtree==p ){ 000426 pLock = pIter; 000427 break; 000428 } 000429 } 000430 000431 /* If the above search did not find a BtLock struct associating Btree p 000432 ** with table iTable, allocate one and link it into the list. 000433 */ 000434 if( !pLock ){ 000435 pLock = (BtLock *)sqlite3MallocZero(sizeof(BtLock)); 000436 if( !pLock ){ 000437 return SQLITE_NOMEM_BKPT; 000438 } 000439 pLock->iTable = iTable; 000440 pLock->pBtree = p; 000441 pLock->pNext = pBt->pLock; 000442 pBt->pLock = pLock; 000443 } 000444 000445 /* Set the BtLock.eLock variable to the maximum of the current lock 000446 ** and the requested lock. This means if a write-lock was already held 000447 ** and a read-lock requested, we don't incorrectly downgrade the lock. 000448 */ 000449 assert( WRITE_LOCK>READ_LOCK ); 000450 if( eLock>pLock->eLock ){ 000451 pLock->eLock = eLock; 000452 } 000453 000454 return SQLITE_OK; 000455 } 000456 #endif /* !SQLITE_OMIT_SHARED_CACHE */ 000457 000458 #ifndef SQLITE_OMIT_SHARED_CACHE 000459 /* 000460 ** Release all the table locks (locks obtained via calls to 000461 ** the setSharedCacheTableLock() procedure) held by Btree object p. 000462 ** 000463 ** This function assumes that Btree p has an open read or write 000464 ** transaction. If it does not, then the BTS_PENDING flag 000465 ** may be incorrectly cleared. 
000466 */ 000467 static void clearAllSharedCacheTableLocks(Btree *p){ 000468 BtShared *pBt = p->pBt; 000469 BtLock **ppIter = &pBt->pLock; 000470 000471 assert( sqlite3BtreeHoldsMutex(p) ); 000472 assert( p->sharable || 0==*ppIter ); 000473 assert( p->inTrans>0 ); 000474 000475 SHARED_LOCK_TRACE(pBt, "clearAllLocks", 0, 0); 000476 000477 while( *ppIter ){ 000478 BtLock *pLock = *ppIter; 000479 assert( (pBt->btsFlags & BTS_EXCLUSIVE)==0 || pBt->pWriter==pLock->pBtree ); 000480 assert( pLock->pBtree->inTrans>=pLock->eLock ); 000481 if( pLock->pBtree==p ){ 000482 *ppIter = pLock->pNext; 000483 assert( pLock->iTable!=1 || pLock==&p->lock ); 000484 if( pLock->iTable!=1 ){ 000485 sqlite3_free(pLock); 000486 } 000487 }else{ 000488 ppIter = &pLock->pNext; 000489 } 000490 } 000491 000492 assert( (pBt->btsFlags & BTS_PENDING)==0 || pBt->pWriter ); 000493 if( pBt->pWriter==p ){ 000494 pBt->pWriter = 0; 000495 pBt->btsFlags &= ~(BTS_EXCLUSIVE|BTS_PENDING); 000496 }else if( pBt->nTransaction==2 ){ 000497 /* This function is called when Btree p is concluding its 000498 ** transaction. If there currently exists a writer, and p is not 000499 ** that writer, then the number of locks held by connections other 000500 ** than the writer must be about to drop to zero. In this case 000501 ** set the BTS_PENDING flag to 0. 000502 ** 000503 ** If there is not currently a writer, then BTS_PENDING must 000504 ** be zero already. So this next line is harmless in that case. 000505 */ 000506 pBt->btsFlags &= ~BTS_PENDING; 000507 } 000508 } 000509 000510 /* 000511 ** This function changes all write-locks held by Btree p into read-locks. 
000512 */ 000513 static void downgradeAllSharedCacheTableLocks(Btree *p){ 000514 BtShared *pBt = p->pBt; 000515 000516 SHARED_LOCK_TRACE(pBt, "downgradeLocks", 0, 0); 000517 000518 if( pBt->pWriter==p ){ 000519 BtLock *pLock; 000520 pBt->pWriter = 0; 000521 pBt->btsFlags &= ~(BTS_EXCLUSIVE|BTS_PENDING); 000522 for(pLock=pBt->pLock; pLock; pLock=pLock->pNext){ 000523 assert( pLock->eLock==READ_LOCK || pLock->pBtree==p ); 000524 pLock->eLock = READ_LOCK; 000525 } 000526 } 000527 } 000528 000529 #endif /* SQLITE_OMIT_SHARED_CACHE */ 000530 000531 static void releasePage(MemPage *pPage); /* Forward reference */ 000532 static void releasePageOne(MemPage *pPage); /* Forward reference */ 000533 static void releasePageNotNull(MemPage *pPage); /* Forward reference */ 000534 000535 /* 000536 ***** This routine is used inside of assert() only **** 000537 ** 000538 ** Verify that the cursor holds the mutex on its BtShared 000539 */ 000540 #ifdef SQLITE_DEBUG 000541 static int cursorHoldsMutex(BtCursor *p){ 000542 return sqlite3_mutex_held(p->pBt->mutex); 000543 } 000544 000545 /* Verify that the cursor and the BtShared agree about what is the current 000546 ** database connetion. This is important in shared-cache mode. If the database 000547 ** connection pointers get out-of-sync, it is possible for routines like 000548 ** btreeInitPage() to reference an stale connection pointer that references a 000549 ** a connection that has already closed. This routine is used inside assert() 000550 ** statements only and for the purpose of double-checking that the btree code 000551 ** does keep the database connection pointers up-to-date. 000552 */ 000553 static int cursorOwnsBtShared(BtCursor *p){ 000554 assert( cursorHoldsMutex(p) ); 000555 return (p->pBtree->db==p->pBt->db); 000556 } 000557 #endif 000558 000559 /* 000560 ** Invalidate the overflow cache of the cursor passed as the first argument. 000561 ** on the shared btree structure pBt. 
000562 */ 000563 #define invalidateOverflowCache(pCur) (pCur->curFlags &= ~BTCF_ValidOvfl) 000564 000565 /* 000566 ** Invalidate the overflow page-list cache for all cursors opened 000567 ** on the shared btree structure pBt. 000568 */ 000569 static void invalidateAllOverflowCache(BtShared *pBt){ 000570 BtCursor *p; 000571 assert( sqlite3_mutex_held(pBt->mutex) ); 000572 for(p=pBt->pCursor; p; p=p->pNext){ 000573 invalidateOverflowCache(p); 000574 } 000575 } 000576 000577 #ifndef SQLITE_OMIT_INCRBLOB 000578 /* 000579 ** This function is called before modifying the contents of a table 000580 ** to invalidate any incrblob cursors that are open on the 000581 ** row or one of the rows being modified. 000582 ** 000583 ** If argument isClearTable is true, then the entire contents of the 000584 ** table is about to be deleted. In this case invalidate all incrblob 000585 ** cursors open on any row within the table with root-page pgnoRoot. 000586 ** 000587 ** Otherwise, if argument isClearTable is false, then the row with 000588 ** rowid iRow is being replaced or deleted. In this case invalidate 000589 ** only those incrblob cursors open on that specific row. 
000590 */ 000591 static void invalidateIncrblobCursors( 000592 Btree *pBtree, /* The database file to check */ 000593 Pgno pgnoRoot, /* The table that might be changing */ 000594 i64 iRow, /* The rowid that might be changing */ 000595 int isClearTable /* True if all rows are being deleted */ 000596 ){ 000597 BtCursor *p; 000598 assert( pBtree->hasIncrblobCur ); 000599 assert( sqlite3BtreeHoldsMutex(pBtree) ); 000600 pBtree->hasIncrblobCur = 0; 000601 for(p=pBtree->pBt->pCursor; p; p=p->pNext){ 000602 if( (p->curFlags & BTCF_Incrblob)!=0 ){ 000603 pBtree->hasIncrblobCur = 1; 000604 if( p->pgnoRoot==pgnoRoot && (isClearTable || p->info.nKey==iRow) ){ 000605 p->eState = CURSOR_INVALID; 000606 } 000607 } 000608 } 000609 } 000610 000611 #else 000612 /* Stub function when INCRBLOB is omitted */ 000613 #define invalidateIncrblobCursors(w,x,y,z) 000614 #endif /* SQLITE_OMIT_INCRBLOB */ 000615 000616 /* 000617 ** Set bit pgno of the BtShared.pHasContent bitvec. This is called 000618 ** when a page that previously contained data becomes a free-list leaf 000619 ** page. 000620 ** 000621 ** The BtShared.pHasContent bitvec exists to work around an obscure 000622 ** bug caused by the interaction of two useful IO optimizations surrounding 000623 ** free-list leaf pages: 000624 ** 000625 ** 1) When all data is deleted from a page and the page becomes 000626 ** a free-list leaf page, the page is not written to the database 000627 ** (as free-list leaf pages contain no meaningful data). Sometimes 000628 ** such a page is not even journalled (as it will not be modified, 000629 ** why bother journalling it?). 000630 ** 000631 ** 2) When a free-list leaf page is reused, its content is not read 000632 ** from the database or written to the journal file (why should it 000633 ** be, if it is not at all meaningful?). 000634 ** 000635 ** By themselves, these optimizations work fine and provide a handy 000636 ** performance boost to bulk delete or insert operations. 
However, if 000637 ** a page is moved to the free-list and then reused within the same 000638 ** transaction, a problem comes up. If the page is not journalled when 000639 ** it is moved to the free-list and it is also not journalled when it 000640 ** is extracted from the free-list and reused, then the original data 000641 ** may be lost. In the event of a rollback, it may not be possible 000642 ** to restore the database to its original configuration. 000643 ** 000644 ** The solution is the BtShared.pHasContent bitvec. Whenever a page is 000645 ** moved to become a free-list leaf page, the corresponding bit is 000646 ** set in the bitvec. Whenever a leaf page is extracted from the free-list, 000647 ** optimization 2 above is omitted if the corresponding bit is already 000648 ** set in BtShared.pHasContent. The contents of the bitvec are cleared 000649 ** at the end of every transaction. 000650 */ 000651 static int btreeSetHasContent(BtShared *pBt, Pgno pgno){ 000652 int rc = SQLITE_OK; 000653 if( !pBt->pHasContent ){ 000654 assert( pgno<=pBt->nPage ); 000655 pBt->pHasContent = sqlite3BitvecCreate(pBt->nPage); 000656 if( !pBt->pHasContent ){ 000657 rc = SQLITE_NOMEM_BKPT; 000658 } 000659 } 000660 if( rc==SQLITE_OK && pgno<=sqlite3BitvecSize(pBt->pHasContent) ){ 000661 rc = sqlite3BitvecSet(pBt->pHasContent, pgno); 000662 } 000663 return rc; 000664 } 000665 000666 /* 000667 ** Query the BtShared.pHasContent vector. 000668 ** 000669 ** This function is called when a free-list leaf page is removed from the 000670 ** free-list for reuse. It returns false if it is safe to retrieve the 000671 ** page from the pager layer with the 'no-content' flag set. True otherwise. 000672 */ 000673 static int btreeGetHasContent(BtShared *pBt, Pgno pgno){ 000674 Bitvec *p = pBt->pHasContent; 000675 return p && (pgno>sqlite3BitvecSize(p) || sqlite3BitvecTestNotNull(p, pgno)); 000676 } 000677 000678 /* 000679 ** Clear (destroy) the BtShared.pHasContent bitvec. 
This should be 000680 ** invoked at the conclusion of each write-transaction. 000681 */ 000682 static void btreeClearHasContent(BtShared *pBt){ 000683 sqlite3BitvecDestroy(pBt->pHasContent); 000684 pBt->pHasContent = 0; 000685 } 000686 000687 /* 000688 ** Release all of the apPage[] pages for a cursor. 000689 */ 000690 static void btreeReleaseAllCursorPages(BtCursor *pCur){ 000691 int i; 000692 if( pCur->iPage>=0 ){ 000693 for(i=0; i<pCur->iPage; i++){ 000694 releasePageNotNull(pCur->apPage[i]); 000695 } 000696 releasePageNotNull(pCur->pPage); 000697 pCur->iPage = -1; 000698 } 000699 } 000700 000701 /* 000702 ** The cursor passed as the only argument must point to a valid entry 000703 ** when this function is called (i.e. have eState==CURSOR_VALID). This 000704 ** function saves the current cursor key in variables pCur->nKey and 000705 ** pCur->pKey. SQLITE_OK is returned if successful or an SQLite error 000706 ** code otherwise. 000707 ** 000708 ** If the cursor is open on an intkey table, then the integer key 000709 ** (the rowid) is stored in pCur->nKey and pCur->pKey is left set to 000710 ** NULL. If the cursor is open on a non-intkey table, then pCur->pKey is 000711 ** set to point to a malloced buffer pCur->nKey bytes in size containing 000712 ** the key. 000713 */ 000714 static int saveCursorKey(BtCursor *pCur){ 000715 int rc = SQLITE_OK; 000716 assert( CURSOR_VALID==pCur->eState ); 000717 assert( 0==pCur->pKey ); 000718 assert( cursorHoldsMutex(pCur) ); 000719 000720 if( pCur->curIntKey ){ 000721 /* Only the rowid is required for a table btree */ 000722 pCur->nKey = sqlite3BtreeIntegerKey(pCur); 000723 }else{ 000724 /* For an index btree, save the complete key content. It is possible 000725 ** that the current key is corrupt. In that case, it is possible that 000726 ** the sqlite3VdbeRecordUnpack() function may overread the buffer by 000727 ** up to the size of 1 varint plus 1 8-byte value when the cursor 000728 ** position is restored. 
Hence the 17 bytes of padding allocated 000729 ** below. */ 000730 void *pKey; 000731 pCur->nKey = sqlite3BtreePayloadSize(pCur); 000732 pKey = sqlite3Malloc( pCur->nKey + 9 + 8 ); 000733 if( pKey ){ 000734 rc = sqlite3BtreePayload(pCur, 0, (int)pCur->nKey, pKey); 000735 if( rc==SQLITE_OK ){ 000736 memset(((u8*)pKey)+pCur->nKey, 0, 9+8); 000737 pCur->pKey = pKey; 000738 }else{ 000739 sqlite3_free(pKey); 000740 } 000741 }else{ 000742 rc = SQLITE_NOMEM_BKPT; 000743 } 000744 } 000745 assert( !pCur->curIntKey || !pCur->pKey ); 000746 return rc; 000747 } 000748 000749 /* 000750 ** Save the current cursor position in the variables BtCursor.nKey 000751 ** and BtCursor.pKey. The cursor's state is set to CURSOR_REQUIRESEEK. 000752 ** 000753 ** The caller must ensure that the cursor is valid (has eState==CURSOR_VALID) 000754 ** prior to calling this routine. 000755 */ 000756 static int saveCursorPosition(BtCursor *pCur){ 000757 int rc; 000758 000759 assert( CURSOR_VALID==pCur->eState || CURSOR_SKIPNEXT==pCur->eState ); 000760 assert( 0==pCur->pKey ); 000761 assert( cursorHoldsMutex(pCur) ); 000762 000763 if( pCur->curFlags & BTCF_Pinned ){ 000764 return SQLITE_CONSTRAINT_PINNED; 000765 } 000766 if( pCur->eState==CURSOR_SKIPNEXT ){ 000767 pCur->eState = CURSOR_VALID; 000768 }else{ 000769 pCur->skipNext = 0; 000770 } 000771 000772 rc = saveCursorKey(pCur); 000773 if( rc==SQLITE_OK ){ 000774 btreeReleaseAllCursorPages(pCur); 000775 pCur->eState = CURSOR_REQUIRESEEK; 000776 } 000777 000778 pCur->curFlags &= ~(BTCF_ValidNKey|BTCF_ValidOvfl|BTCF_AtLast); 000779 return rc; 000780 } 000781 000782 /* Forward reference */ 000783 static int SQLITE_NOINLINE saveCursorsOnList(BtCursor*,Pgno,BtCursor*); 000784 000785 /* 000786 ** Save the positions of all cursors (except pExcept) that are open on 000787 ** the table with root-page iRoot. 
"Saving the cursor position" means that 000788 ** the location in the btree is remembered in such a way that it can be 000789 ** moved back to the same spot after the btree has been modified. This 000790 ** routine is called just before cursor pExcept is used to modify the 000791 ** table, for example in BtreeDelete() or BtreeInsert(). 000792 ** 000793 ** If there are two or more cursors on the same btree, then all such 000794 ** cursors should have their BTCF_Multiple flag set. The btreeCursor() 000795 ** routine enforces that rule. This routine only needs to be called in 000796 ** the uncommon case when pExpect has the BTCF_Multiple flag set. 000797 ** 000798 ** If pExpect!=NULL and if no other cursors are found on the same root-page, 000799 ** then the BTCF_Multiple flag on pExpect is cleared, to avoid another 000800 ** pointless call to this routine. 000801 ** 000802 ** Implementation note: This routine merely checks to see if any cursors 000803 ** need to be saved. It calls out to saveCursorsOnList() in the (unusual) 000804 ** event that cursors are in need to being saved. 000805 */ 000806 static int saveAllCursors(BtShared *pBt, Pgno iRoot, BtCursor *pExcept){ 000807 BtCursor *p; 000808 assert( sqlite3_mutex_held(pBt->mutex) ); 000809 assert( pExcept==0 || pExcept->pBt==pBt ); 000810 for(p=pBt->pCursor; p; p=p->pNext){ 000811 if( p!=pExcept && (0==iRoot || p->pgnoRoot==iRoot) ) break; 000812 } 000813 if( p ) return saveCursorsOnList(p, iRoot, pExcept); 000814 if( pExcept ) pExcept->curFlags &= ~BTCF_Multiple; 000815 return SQLITE_OK; 000816 } 000817 000818 /* This helper routine to saveAllCursors does the actual work of saving 000819 ** the cursors if and when a cursor is found that actually requires saving. 000820 ** The common case is that no cursors need to be saved, so this routine is 000821 ** broken out from its caller to avoid unnecessary stack pointer movement. 
000822 */ 000823 static int SQLITE_NOINLINE saveCursorsOnList( 000824 BtCursor *p, /* The first cursor that needs saving */ 000825 Pgno iRoot, /* Only save cursor with this iRoot. Save all if zero */ 000826 BtCursor *pExcept /* Do not save this cursor */ 000827 ){ 000828 do{ 000829 if( p!=pExcept && (0==iRoot || p->pgnoRoot==iRoot) ){ 000830 if( p->eState==CURSOR_VALID || p->eState==CURSOR_SKIPNEXT ){ 000831 int rc = saveCursorPosition(p); 000832 if( SQLITE_OK!=rc ){ 000833 return rc; 000834 } 000835 }else{ 000836 testcase( p->iPage>=0 ); 000837 btreeReleaseAllCursorPages(p); 000838 } 000839 } 000840 p = p->pNext; 000841 }while( p ); 000842 return SQLITE_OK; 000843 } 000844 000845 /* 000846 ** Clear the current cursor position. 000847 */ 000848 void sqlite3BtreeClearCursor(BtCursor *pCur){ 000849 assert( cursorHoldsMutex(pCur) ); 000850 sqlite3_free(pCur->pKey); 000851 pCur->pKey = 0; 000852 pCur->eState = CURSOR_INVALID; 000853 } 000854 000855 /* 000856 ** In this version of BtreeMoveto, pKey is a packed index record 000857 ** such as is generated by the OP_MakeRecord opcode. Unpack the 000858 ** record and then call sqlite3BtreeIndexMoveto() to do the work. 000859 */ 000860 static int btreeMoveto( 000861 BtCursor *pCur, /* Cursor open on the btree to be searched */ 000862 const void *pKey, /* Packed key if the btree is an index */ 000863 i64 nKey, /* Integer key for tables. 
Size of pKey for indices */ 000864 int bias, /* Bias search to the high end */ 000865 int *pRes /* Write search results here */ 000866 ){ 000867 int rc; /* Status code */ 000868 UnpackedRecord *pIdxKey; /* Unpacked index key */ 000869 000870 if( pKey ){ 000871 KeyInfo *pKeyInfo = pCur->pKeyInfo; 000872 assert( nKey==(i64)(int)nKey ); 000873 pIdxKey = sqlite3VdbeAllocUnpackedRecord(pKeyInfo); 000874 if( pIdxKey==0 ) return SQLITE_NOMEM_BKPT; 000875 sqlite3VdbeRecordUnpack(pKeyInfo, (int)nKey, pKey, pIdxKey); 000876 if( pIdxKey->nField==0 || pIdxKey->nField>pKeyInfo->nAllField ){ 000877 rc = SQLITE_CORRUPT_BKPT; 000878 }else{ 000879 rc = sqlite3BtreeIndexMoveto(pCur, pIdxKey, pRes); 000880 } 000881 sqlite3DbFree(pCur->pKeyInfo->db, pIdxKey); 000882 }else{ 000883 pIdxKey = 0; 000884 rc = sqlite3BtreeTableMoveto(pCur, nKey, bias, pRes); 000885 } 000886 return rc; 000887 } 000888 000889 /* 000890 ** Restore the cursor to the position it was in (or as close to as possible) 000891 ** when saveCursorPosition() was called. Note that this call deletes the 000892 ** saved position info stored by saveCursorPosition(), so there can be 000893 ** at most one effective restoreCursorPosition() call after each 000894 ** saveCursorPosition(). 
000895 */ 000896 static int btreeRestoreCursorPosition(BtCursor *pCur){ 000897 int rc; 000898 int skipNext = 0; 000899 assert( cursorOwnsBtShared(pCur) ); 000900 assert( pCur->eState>=CURSOR_REQUIRESEEK ); 000901 if( pCur->eState==CURSOR_FAULT ){ 000902 return pCur->skipNext; 000903 } 000904 pCur->eState = CURSOR_INVALID; 000905 if( sqlite3FaultSim(410) ){ 000906 rc = SQLITE_IOERR; 000907 }else{ 000908 rc = btreeMoveto(pCur, pCur->pKey, pCur->nKey, 0, &skipNext); 000909 } 000910 if( rc==SQLITE_OK ){ 000911 sqlite3_free(pCur->pKey); 000912 pCur->pKey = 0; 000913 assert( pCur->eState==CURSOR_VALID || pCur->eState==CURSOR_INVALID ); 000914 if( skipNext ) pCur->skipNext = skipNext; 000915 if( pCur->skipNext && pCur->eState==CURSOR_VALID ){ 000916 pCur->eState = CURSOR_SKIPNEXT; 000917 } 000918 } 000919 return rc; 000920 } 000921 000922 #define restoreCursorPosition(p) \ 000923 (p->eState>=CURSOR_REQUIRESEEK ? \ 000924 btreeRestoreCursorPosition(p) : \ 000925 SQLITE_OK) 000926 000927 /* 000928 ** Determine whether or not a cursor has moved from the position where 000929 ** it was last placed, or has been invalidated for any other reason. 000930 ** Cursors can move when the row they are pointing at is deleted out 000931 ** from under them, for example. Cursor might also move if a btree 000932 ** is rebalanced. 000933 ** 000934 ** Calling this routine with a NULL cursor pointer returns false. 000935 ** 000936 ** Use the separate sqlite3BtreeCursorRestore() routine to restore a cursor 000937 ** back to where it ought to be if this routine returns true. 
000938 */ 000939 int sqlite3BtreeCursorHasMoved(BtCursor *pCur){ 000940 assert( EIGHT_BYTE_ALIGNMENT(pCur) 000941 || pCur==sqlite3BtreeFakeValidCursor() ); 000942 assert( offsetof(BtCursor, eState)==0 ); 000943 assert( sizeof(pCur->eState)==1 ); 000944 return CURSOR_VALID != *(u8*)pCur; 000945 } 000946 000947 /* 000948 ** Return a pointer to a fake BtCursor object that will always answer 000949 ** false to the sqlite3BtreeCursorHasMoved() routine above. The fake 000950 ** cursor returned must not be used with any other Btree interface. 000951 */ 000952 BtCursor *sqlite3BtreeFakeValidCursor(void){ 000953 static u8 fakeCursor = CURSOR_VALID; 000954 assert( offsetof(BtCursor, eState)==0 ); 000955 return (BtCursor*)&fakeCursor; 000956 } 000957 000958 /* 000959 ** This routine restores a cursor back to its original position after it 000960 ** has been moved by some outside activity (such as a btree rebalance or 000961 ** a row having been deleted out from under the cursor). 000962 ** 000963 ** On success, the *pDifferentRow parameter is false if the cursor is left 000964 ** pointing at exactly the same row. *pDifferntRow is the row the cursor 000965 ** was pointing to has been deleted, forcing the cursor to point to some 000966 ** nearby row. 000967 ** 000968 ** This routine should only be called for a cursor that just returned 000969 ** TRUE from sqlite3BtreeCursorHasMoved(). 000970 */ 000971 int sqlite3BtreeCursorRestore(BtCursor *pCur, int *pDifferentRow){ 000972 int rc; 000973 000974 assert( pCur!=0 ); 000975 assert( pCur->eState!=CURSOR_VALID ); 000976 rc = restoreCursorPosition(pCur); 000977 if( rc ){ 000978 *pDifferentRow = 1; 000979 return rc; 000980 } 000981 if( pCur->eState!=CURSOR_VALID ){ 000982 *pDifferentRow = 1; 000983 }else{ 000984 *pDifferentRow = 0; 000985 } 000986 return SQLITE_OK; 000987 } 000988 000989 #ifdef SQLITE_ENABLE_CURSOR_HINTS 000990 /* 000991 ** Provide hints to the cursor. 
The particular hint given (and the type 000992 ** and number of the varargs parameters) is determined by the eHintType 000993 ** parameter. See the definitions of the BTREE_HINT_* macros for details. 000994 */ 000995 void sqlite3BtreeCursorHint(BtCursor *pCur, int eHintType, ...){ 000996 /* Used only by system that substitute their own storage engine */ 000997 #ifdef SQLITE_DEBUG 000998 if( ALWAYS(eHintType==BTREE_HINT_RANGE) ){ 000999 va_list ap; 001000 Expr *pExpr; 001001 Walker w; 001002 memset(&w, 0, sizeof(w)); 001003 w.xExprCallback = sqlite3CursorRangeHintExprCheck; 001004 va_start(ap, eHintType); 001005 pExpr = va_arg(ap, Expr*); 001006 w.u.aMem = va_arg(ap, Mem*); 001007 va_end(ap); 001008 assert( pExpr!=0 ); 001009 assert( w.u.aMem!=0 ); 001010 sqlite3WalkExpr(&w, pExpr); 001011 } 001012 #endif /* SQLITE_DEBUG */ 001013 } 001014 #endif /* SQLITE_ENABLE_CURSOR_HINTS */ 001015 001016 001017 /* 001018 ** Provide flag hints to the cursor. 001019 */ 001020 void sqlite3BtreeCursorHintFlags(BtCursor *pCur, unsigned x){ 001021 assert( x==BTREE_SEEK_EQ || x==BTREE_BULKLOAD || x==0 ); 001022 pCur->hints = x; 001023 } 001024 001025 001026 #ifndef SQLITE_OMIT_AUTOVACUUM 001027 /* 001028 ** Given a page number of a regular database page, return the page 001029 ** number for the pointer-map page that contains the entry for the 001030 ** input page number. 001031 ** 001032 ** Return 0 (not a valid page) for pgno==1 since there is 001033 ** no pointer map associated with page 1. The integrity_check logic 001034 ** requires that ptrmapPageno(*,1)!=1. 
001035 */ 001036 static Pgno ptrmapPageno(BtShared *pBt, Pgno pgno){ 001037 int nPagesPerMapPage; 001038 Pgno iPtrMap, ret; 001039 assert( sqlite3_mutex_held(pBt->mutex) ); 001040 if( pgno<2 ) return 0; 001041 nPagesPerMapPage = (pBt->usableSize/5)+1; 001042 iPtrMap = (pgno-2)/nPagesPerMapPage; 001043 ret = (iPtrMap*nPagesPerMapPage) + 2; 001044 if( ret==PENDING_BYTE_PAGE(pBt) ){ 001045 ret++; 001046 } 001047 return ret; 001048 } 001049 001050 /* 001051 ** Write an entry into the pointer map. 001052 ** 001053 ** This routine updates the pointer map entry for page number 'key' 001054 ** so that it maps to type 'eType' and parent page number 'pgno'. 001055 ** 001056 ** If *pRC is initially non-zero (non-SQLITE_OK) then this routine is 001057 ** a no-op. If an error occurs, the appropriate error code is written 001058 ** into *pRC. 001059 */ 001060 static void ptrmapPut(BtShared *pBt, Pgno key, u8 eType, Pgno parent, int *pRC){ 001061 DbPage *pDbPage; /* The pointer map page */ 001062 u8 *pPtrmap; /* The pointer map data */ 001063 Pgno iPtrmap; /* The pointer map page number */ 001064 int offset; /* Offset in pointer map page */ 001065 int rc; /* Return code from subfunctions */ 001066 001067 if( *pRC ) return; 001068 001069 assert( sqlite3_mutex_held(pBt->mutex) ); 001070 /* The super-journal page number must never be used as a pointer map page */ 001071 assert( 0==PTRMAP_ISPAGE(pBt, PENDING_BYTE_PAGE(pBt)) ); 001072 001073 assert( pBt->autoVacuum ); 001074 if( key==0 ){ 001075 *pRC = SQLITE_CORRUPT_BKPT; 001076 return; 001077 } 001078 iPtrmap = PTRMAP_PAGENO(pBt, key); 001079 rc = sqlite3PagerGet(pBt->pPager, iPtrmap, &pDbPage, 0); 001080 if( rc!=SQLITE_OK ){ 001081 *pRC = rc; 001082 return; 001083 } 001084 if( ((char*)sqlite3PagerGetExtra(pDbPage))[0]!=0 ){ 001085 /* The first byte of the extra data is the MemPage.isInit byte. 001086 ** If that byte is set, it means this page is also being used 001087 ** as a btree page. 
*/ 001088 *pRC = SQLITE_CORRUPT_BKPT; 001089 goto ptrmap_exit; 001090 } 001091 offset = PTRMAP_PTROFFSET(iPtrmap, key); 001092 if( offset<0 ){ 001093 *pRC = SQLITE_CORRUPT_BKPT; 001094 goto ptrmap_exit; 001095 } 001096 assert( offset <= (int)pBt->usableSize-5 ); 001097 pPtrmap = (u8 *)sqlite3PagerGetData(pDbPage); 001098 001099 if( eType!=pPtrmap[offset] || get4byte(&pPtrmap[offset+1])!=parent ){ 001100 TRACE(("PTRMAP_UPDATE: %u->(%u,%u)\n", key, eType, parent)); 001101 *pRC= rc = sqlite3PagerWrite(pDbPage); 001102 if( rc==SQLITE_OK ){ 001103 pPtrmap[offset] = eType; 001104 put4byte(&pPtrmap[offset+1], parent); 001105 } 001106 } 001107 001108 ptrmap_exit: 001109 sqlite3PagerUnref(pDbPage); 001110 } 001111 001112 /* 001113 ** Read an entry from the pointer map. 001114 ** 001115 ** This routine retrieves the pointer map entry for page 'key', writing 001116 ** the type and parent page number to *pEType and *pPgno respectively. 001117 ** An error code is returned if something goes wrong, otherwise SQLITE_OK. 
001118 */ 001119 static int ptrmapGet(BtShared *pBt, Pgno key, u8 *pEType, Pgno *pPgno){ 001120 DbPage *pDbPage; /* The pointer map page */ 001121 int iPtrmap; /* Pointer map page index */ 001122 u8 *pPtrmap; /* Pointer map page data */ 001123 int offset; /* Offset of entry in pointer map */ 001124 int rc; 001125 001126 assert( sqlite3_mutex_held(pBt->mutex) ); 001127 001128 iPtrmap = PTRMAP_PAGENO(pBt, key); 001129 rc = sqlite3PagerGet(pBt->pPager, iPtrmap, &pDbPage, 0); 001130 if( rc!=0 ){ 001131 return rc; 001132 } 001133 pPtrmap = (u8 *)sqlite3PagerGetData(pDbPage); 001134 001135 offset = PTRMAP_PTROFFSET(iPtrmap, key); 001136 if( offset<0 ){ 001137 sqlite3PagerUnref(pDbPage); 001138 return SQLITE_CORRUPT_BKPT; 001139 } 001140 assert( offset <= (int)pBt->usableSize-5 ); 001141 assert( pEType!=0 ); 001142 *pEType = pPtrmap[offset]; 001143 if( pPgno ) *pPgno = get4byte(&pPtrmap[offset+1]); 001144 001145 sqlite3PagerUnref(pDbPage); 001146 if( *pEType<1 || *pEType>5 ) return SQLITE_CORRUPT_PGNO(iPtrmap); 001147 return SQLITE_OK; 001148 } 001149 001150 #else /* if defined SQLITE_OMIT_AUTOVACUUM */ 001151 #define ptrmapPut(w,x,y,z,rc) 001152 #define ptrmapGet(w,x,y,z) SQLITE_OK 001153 #define ptrmapPutOvflPtr(x, y, z, rc) 001154 #endif 001155 001156 /* 001157 ** Given a btree page and a cell index (0 means the first cell on 001158 ** the page, 1 means the second cell, and so forth) return a pointer 001159 ** to the cell content. 001160 ** 001161 ** findCellPastPtr() does the same except it skips past the initial 001162 ** 4-byte child pointer found on interior pages, if there is one. 001163 ** 001164 ** This routine works only for pages that do not contain overflow cells. 
001165 */ 001166 #define findCell(P,I) \ 001167 ((P)->aData + ((P)->maskPage & get2byteAligned(&(P)->aCellIdx[2*(I)]))) 001168 #define findCellPastPtr(P,I) \ 001169 ((P)->aDataOfst + ((P)->maskPage & get2byteAligned(&(P)->aCellIdx[2*(I)]))) 001170 001171 001172 /* 001173 ** This is common tail processing for btreeParseCellPtr() and 001174 ** btreeParseCellPtrIndex() for the case when the cell does not fit entirely 001175 ** on a single B-tree page. Make necessary adjustments to the CellInfo 001176 ** structure. 001177 */ 001178 static SQLITE_NOINLINE void btreeParseCellAdjustSizeForOverflow( 001179 MemPage *pPage, /* Page containing the cell */ 001180 u8 *pCell, /* Pointer to the cell text. */ 001181 CellInfo *pInfo /* Fill in this structure */ 001182 ){ 001183 /* If the payload will not fit completely on the local page, we have 001184 ** to decide how much to store locally and how much to spill onto 001185 ** overflow pages. The strategy is to minimize the amount of unused 001186 ** space on overflow pages while keeping the amount of local storage 001187 ** in between minLocal and maxLocal. 001188 ** 001189 ** Warning: changing the way overflow payload is distributed in any 001190 ** way will result in an incompatible file format. 
001191 */ 001192 int minLocal; /* Minimum amount of payload held locally */ 001193 int maxLocal; /* Maximum amount of payload held locally */ 001194 int surplus; /* Overflow payload available for local storage */ 001195 001196 minLocal = pPage->minLocal; 001197 maxLocal = pPage->maxLocal; 001198 surplus = minLocal + (pInfo->nPayload - minLocal)%(pPage->pBt->usableSize-4); 001199 testcase( surplus==maxLocal ); 001200 testcase( surplus==maxLocal+1 ); 001201 if( surplus <= maxLocal ){ 001202 pInfo->nLocal = (u16)surplus; 001203 }else{ 001204 pInfo->nLocal = (u16)minLocal; 001205 } 001206 pInfo->nSize = (u16)(&pInfo->pPayload[pInfo->nLocal] - pCell) + 4; 001207 } 001208 001209 /* 001210 ** Given a record with nPayload bytes of payload stored within btree 001211 ** page pPage, return the number of bytes of payload stored locally. 001212 */ 001213 static int btreePayloadToLocal(MemPage *pPage, i64 nPayload){ 001214 int maxLocal; /* Maximum amount of payload held locally */ 001215 maxLocal = pPage->maxLocal; 001216 if( nPayload<=maxLocal ){ 001217 return nPayload; 001218 }else{ 001219 int minLocal; /* Minimum amount of payload held locally */ 001220 int surplus; /* Overflow payload available for local storage */ 001221 minLocal = pPage->minLocal; 001222 surplus = minLocal + (nPayload - minLocal)%(pPage->pBt->usableSize-4); 001223 return ( surplus <= maxLocal ) ? surplus : minLocal; 001224 } 001225 } 001226 001227 /* 001228 ** The following routines are implementations of the MemPage.xParseCell() 001229 ** method. 001230 ** 001231 ** Parse a cell content block and fill in the CellInfo structure. 001232 ** 001233 ** btreeParseCellPtr() => table btree leaf nodes 001234 ** btreeParseCellNoPayload() => table btree internal nodes 001235 ** btreeParseCellPtrIndex() => index btree nodes 001236 ** 001237 ** There is also a wrapper function btreeParseCell() that works for 001238 ** all MemPage types and that references the cell by index rather than 001239 ** by pointer. 
001240 */ 001241 static void btreeParseCellPtrNoPayload( 001242 MemPage *pPage, /* Page containing the cell */ 001243 u8 *pCell, /* Pointer to the cell text. */ 001244 CellInfo *pInfo /* Fill in this structure */ 001245 ){ 001246 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 001247 assert( pPage->leaf==0 ); 001248 assert( pPage->childPtrSize==4 ); 001249 #ifndef SQLITE_DEBUG 001250 UNUSED_PARAMETER(pPage); 001251 #endif 001252 pInfo->nSize = 4 + getVarint(&pCell[4], (u64*)&pInfo->nKey); 001253 pInfo->nPayload = 0; 001254 pInfo->nLocal = 0; 001255 pInfo->pPayload = 0; 001256 return; 001257 } 001258 static void btreeParseCellPtr( 001259 MemPage *pPage, /* Page containing the cell */ 001260 u8 *pCell, /* Pointer to the cell text. */ 001261 CellInfo *pInfo /* Fill in this structure */ 001262 ){ 001263 u8 *pIter; /* For scanning through pCell */ 001264 u32 nPayload; /* Number of bytes of cell payload */ 001265 u64 iKey; /* Extracted Key value */ 001266 001267 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 001268 assert( pPage->leaf==0 || pPage->leaf==1 ); 001269 assert( pPage->intKeyLeaf ); 001270 assert( pPage->childPtrSize==0 ); 001271 pIter = pCell; 001272 001273 /* The next block of code is equivalent to: 001274 ** 001275 ** pIter += getVarint32(pIter, nPayload); 001276 ** 001277 ** The code is inlined to avoid a function call. 001278 */ 001279 nPayload = *pIter; 001280 if( nPayload>=0x80 ){ 001281 u8 *pEnd = &pIter[8]; 001282 nPayload &= 0x7f; 001283 do{ 001284 nPayload = (nPayload<<7) | (*++pIter & 0x7f); 001285 }while( (*pIter)>=0x80 && pIter<pEnd ); 001286 } 001287 pIter++; 001288 001289 /* The next block of code is equivalent to: 001290 ** 001291 ** pIter += getVarint(pIter, (u64*)&pInfo->nKey); 001292 ** 001293 ** The code is inlined and the loop is unrolled for performance. 001294 ** This routine is a high-runner. 
001295 */ 001296 iKey = *pIter; 001297 if( iKey>=0x80 ){ 001298 u8 x; 001299 iKey = (iKey<<7) ^ (x = *++pIter); 001300 if( x>=0x80 ){ 001301 iKey = (iKey<<7) ^ (x = *++pIter); 001302 if( x>=0x80 ){ 001303 iKey = (iKey<<7) ^ 0x10204000 ^ (x = *++pIter); 001304 if( x>=0x80 ){ 001305 iKey = (iKey<<7) ^ 0x4000 ^ (x = *++pIter); 001306 if( x>=0x80 ){ 001307 iKey = (iKey<<7) ^ 0x4000 ^ (x = *++pIter); 001308 if( x>=0x80 ){ 001309 iKey = (iKey<<7) ^ 0x4000 ^ (x = *++pIter); 001310 if( x>=0x80 ){ 001311 iKey = (iKey<<7) ^ 0x4000 ^ (x = *++pIter); 001312 if( x>=0x80 ){ 001313 iKey = (iKey<<8) ^ 0x8000 ^ (*++pIter); 001314 } 001315 } 001316 } 001317 } 001318 } 001319 }else{ 001320 iKey ^= 0x204000; 001321 } 001322 }else{ 001323 iKey ^= 0x4000; 001324 } 001325 } 001326 pIter++; 001327 001328 pInfo->nKey = *(i64*)&iKey; 001329 pInfo->nPayload = nPayload; 001330 pInfo->pPayload = pIter; 001331 testcase( nPayload==pPage->maxLocal ); 001332 testcase( nPayload==(u32)pPage->maxLocal+1 ); 001333 if( nPayload<=pPage->maxLocal ){ 001334 /* This is the (easy) common case where the entire payload fits 001335 ** on the local page. No overflow is required. 001336 */ 001337 pInfo->nSize = nPayload + (u16)(pIter - pCell); 001338 if( pInfo->nSize<4 ) pInfo->nSize = 4; 001339 pInfo->nLocal = (u16)nPayload; 001340 }else{ 001341 btreeParseCellAdjustSizeForOverflow(pPage, pCell, pInfo); 001342 } 001343 } 001344 static void btreeParseCellPtrIndex( 001345 MemPage *pPage, /* Page containing the cell */ 001346 u8 *pCell, /* Pointer to the cell text. 
*/ 001347 CellInfo *pInfo /* Fill in this structure */ 001348 ){ 001349 u8 *pIter; /* For scanning through pCell */ 001350 u32 nPayload; /* Number of bytes of cell payload */ 001351 001352 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 001353 assert( pPage->leaf==0 || pPage->leaf==1 ); 001354 assert( pPage->intKeyLeaf==0 ); 001355 pIter = pCell + pPage->childPtrSize; 001356 nPayload = *pIter; 001357 if( nPayload>=0x80 ){ 001358 u8 *pEnd = &pIter[8]; 001359 nPayload &= 0x7f; 001360 do{ 001361 nPayload = (nPayload<<7) | (*++pIter & 0x7f); 001362 }while( *(pIter)>=0x80 && pIter<pEnd ); 001363 } 001364 pIter++; 001365 pInfo->nKey = nPayload; 001366 pInfo->nPayload = nPayload; 001367 pInfo->pPayload = pIter; 001368 testcase( nPayload==pPage->maxLocal ); 001369 testcase( nPayload==(u32)pPage->maxLocal+1 ); 001370 if( nPayload<=pPage->maxLocal ){ 001371 /* This is the (easy) common case where the entire payload fits 001372 ** on the local page. No overflow is required. 001373 */ 001374 pInfo->nSize = nPayload + (u16)(pIter - pCell); 001375 if( pInfo->nSize<4 ) pInfo->nSize = 4; 001376 pInfo->nLocal = (u16)nPayload; 001377 }else{ 001378 btreeParseCellAdjustSizeForOverflow(pPage, pCell, pInfo); 001379 } 001380 } 001381 static void btreeParseCell( 001382 MemPage *pPage, /* Page containing the cell */ 001383 int iCell, /* The cell index. First cell is 0 */ 001384 CellInfo *pInfo /* Fill in this structure */ 001385 ){ 001386 pPage->xParseCell(pPage, findCell(pPage, iCell), pInfo); 001387 } 001388 001389 /* 001390 ** The following routines are implementations of the MemPage.xCellSize 001391 ** method. 001392 ** 001393 ** Compute the total number of bytes that a Cell needs in the cell 001394 ** data area of the btree-page. The return number includes the cell 001395 ** data header and the local payload, but not any overflow page or 001396 ** the space used by the cell pointer. 
001397 ** 001398 ** cellSizePtrNoPayload() => table internal nodes 001399 ** cellSizePtrTableLeaf() => table leaf nodes 001400 ** cellSizePtr() => index internal nodes 001401 ** cellSizeIdxLeaf() => index leaf nodes 001402 */ 001403 static u16 cellSizePtr(MemPage *pPage, u8 *pCell){ 001404 u8 *pIter = pCell + 4; /* For looping over bytes of pCell */ 001405 u8 *pEnd; /* End mark for a varint */ 001406 u32 nSize; /* Size value to return */ 001407 001408 #ifdef SQLITE_DEBUG 001409 /* The value returned by this function should always be the same as 001410 ** the (CellInfo.nSize) value found by doing a full parse of the 001411 ** cell. If SQLITE_DEBUG is defined, an assert() at the bottom of 001412 ** this function verifies that this invariant is not violated. */ 001413 CellInfo debuginfo; 001414 pPage->xParseCell(pPage, pCell, &debuginfo); 001415 #endif 001416 001417 assert( pPage->childPtrSize==4 ); 001418 nSize = *pIter; 001419 if( nSize>=0x80 ){ 001420 pEnd = &pIter[8]; 001421 nSize &= 0x7f; 001422 do{ 001423 nSize = (nSize<<7) | (*++pIter & 0x7f); 001424 }while( *(pIter)>=0x80 && pIter<pEnd ); 001425 } 001426 pIter++; 001427 testcase( nSize==pPage->maxLocal ); 001428 testcase( nSize==(u32)pPage->maxLocal+1 ); 001429 if( nSize<=pPage->maxLocal ){ 001430 nSize += (u32)(pIter - pCell); 001431 assert( nSize>4 ); 001432 }else{ 001433 int minLocal = pPage->minLocal; 001434 nSize = minLocal + (nSize - minLocal) % (pPage->pBt->usableSize - 4); 001435 testcase( nSize==pPage->maxLocal ); 001436 testcase( nSize==(u32)pPage->maxLocal+1 ); 001437 if( nSize>pPage->maxLocal ){ 001438 nSize = minLocal; 001439 } 001440 nSize += 4 + (u16)(pIter - pCell); 001441 } 001442 assert( nSize==debuginfo.nSize || CORRUPT_DB ); 001443 return (u16)nSize; 001444 } 001445 static u16 cellSizePtrIdxLeaf(MemPage *pPage, u8 *pCell){ 001446 u8 *pIter = pCell; /* For looping over bytes of pCell */ 001447 u8 *pEnd; /* End mark for a varint */ 001448 u32 nSize; /* Size value to return */ 001449 001450 
#ifdef SQLITE_DEBUG 001451 /* The value returned by this function should always be the same as 001452 ** the (CellInfo.nSize) value found by doing a full parse of the 001453 ** cell. If SQLITE_DEBUG is defined, an assert() at the bottom of 001454 ** this function verifies that this invariant is not violated. */ 001455 CellInfo debuginfo; 001456 pPage->xParseCell(pPage, pCell, &debuginfo); 001457 #endif 001458 001459 assert( pPage->childPtrSize==0 ); 001460 nSize = *pIter; 001461 if( nSize>=0x80 ){ 001462 pEnd = &pIter[8]; 001463 nSize &= 0x7f; 001464 do{ 001465 nSize = (nSize<<7) | (*++pIter & 0x7f); 001466 }while( *(pIter)>=0x80 && pIter<pEnd ); 001467 } 001468 pIter++; 001469 testcase( nSize==pPage->maxLocal ); 001470 testcase( nSize==(u32)pPage->maxLocal+1 ); 001471 if( nSize<=pPage->maxLocal ){ 001472 nSize += (u32)(pIter - pCell); 001473 if( nSize<4 ) nSize = 4; 001474 }else{ 001475 int minLocal = pPage->minLocal; 001476 nSize = minLocal + (nSize - minLocal) % (pPage->pBt->usableSize - 4); 001477 testcase( nSize==pPage->maxLocal ); 001478 testcase( nSize==(u32)pPage->maxLocal+1 ); 001479 if( nSize>pPage->maxLocal ){ 001480 nSize = minLocal; 001481 } 001482 nSize += 4 + (u16)(pIter - pCell); 001483 } 001484 assert( nSize==debuginfo.nSize || CORRUPT_DB ); 001485 return (u16)nSize; 001486 } 001487 static u16 cellSizePtrNoPayload(MemPage *pPage, u8 *pCell){ 001488 u8 *pIter = pCell + 4; /* For looping over bytes of pCell */ 001489 u8 *pEnd; /* End mark for a varint */ 001490 001491 #ifdef SQLITE_DEBUG 001492 /* The value returned by this function should always be the same as 001493 ** the (CellInfo.nSize) value found by doing a full parse of the 001494 ** cell. If SQLITE_DEBUG is defined, an assert() at the bottom of 001495 ** this function verifies that this invariant is not violated. 
*/ 001496 CellInfo debuginfo; 001497 pPage->xParseCell(pPage, pCell, &debuginfo); 001498 #else 001499 UNUSED_PARAMETER(pPage); 001500 #endif 001501 001502 assert( pPage->childPtrSize==4 ); 001503 pEnd = pIter + 9; 001504 while( (*pIter++)&0x80 && pIter<pEnd ); 001505 assert( debuginfo.nSize==(u16)(pIter - pCell) || CORRUPT_DB ); 001506 return (u16)(pIter - pCell); 001507 } 001508 static u16 cellSizePtrTableLeaf(MemPage *pPage, u8 *pCell){ 001509 u8 *pIter = pCell; /* For looping over bytes of pCell */ 001510 u8 *pEnd; /* End mark for a varint */ 001511 u32 nSize; /* Size value to return */ 001512 001513 #ifdef SQLITE_DEBUG 001514 /* The value returned by this function should always be the same as 001515 ** the (CellInfo.nSize) value found by doing a full parse of the 001516 ** cell. If SQLITE_DEBUG is defined, an assert() at the bottom of 001517 ** this function verifies that this invariant is not violated. */ 001518 CellInfo debuginfo; 001519 pPage->xParseCell(pPage, pCell, &debuginfo); 001520 #endif 001521 001522 nSize = *pIter; 001523 if( nSize>=0x80 ){ 001524 pEnd = &pIter[8]; 001525 nSize &= 0x7f; 001526 do{ 001527 nSize = (nSize<<7) | (*++pIter & 0x7f); 001528 }while( *(pIter)>=0x80 && pIter<pEnd ); 001529 } 001530 pIter++; 001531 /* pIter now points at the 64-bit integer key value, a variable length 001532 ** integer. The following block moves pIter to point at the first byte 001533 ** past the end of the key value. 
*/ 001534 if( (*pIter++)&0x80 001535 && (*pIter++)&0x80 001536 && (*pIter++)&0x80 001537 && (*pIter++)&0x80 001538 && (*pIter++)&0x80 001539 && (*pIter++)&0x80 001540 && (*pIter++)&0x80 001541 && (*pIter++)&0x80 ){ pIter++; } 001542 testcase( nSize==pPage->maxLocal ); 001543 testcase( nSize==(u32)pPage->maxLocal+1 ); 001544 if( nSize<=pPage->maxLocal ){ 001545 nSize += (u32)(pIter - pCell); 001546 if( nSize<4 ) nSize = 4; 001547 }else{ 001548 int minLocal = pPage->minLocal; 001549 nSize = minLocal + (nSize - minLocal) % (pPage->pBt->usableSize - 4); 001550 testcase( nSize==pPage->maxLocal ); 001551 testcase( nSize==(u32)pPage->maxLocal+1 ); 001552 if( nSize>pPage->maxLocal ){ 001553 nSize = minLocal; 001554 } 001555 nSize += 4 + (u16)(pIter - pCell); 001556 } 001557 assert( nSize==debuginfo.nSize || CORRUPT_DB ); 001558 return (u16)nSize; 001559 } 001560 001561 001562 #ifdef SQLITE_DEBUG 001563 /* This variation on cellSizePtr() is used inside of assert() statements 001564 ** only. */ 001565 static u16 cellSize(MemPage *pPage, int iCell){ 001566 return pPage->xCellSize(pPage, findCell(pPage, iCell)); 001567 } 001568 #endif 001569 001570 #ifndef SQLITE_OMIT_AUTOVACUUM 001571 /* 001572 ** The cell pCell is currently part of page pSrc but will ultimately be part 001573 ** of pPage. (pSrc and pPage are often the same.) If pCell contains a 001574 ** pointer to an overflow page, insert an entry into the pointer-map for 001575 ** the overflow page that will be valid after pCell has been moved to pPage. 
001576 */ 001577 static void ptrmapPutOvflPtr(MemPage *pPage, MemPage *pSrc, u8 *pCell,int *pRC){ 001578 CellInfo info; 001579 if( *pRC ) return; 001580 assert( pCell!=0 ); 001581 pPage->xParseCell(pPage, pCell, &info); 001582 if( info.nLocal<info.nPayload ){ 001583 Pgno ovfl; 001584 if( SQLITE_OVERFLOW(pSrc->aDataEnd, pCell, pCell+info.nLocal) ){ 001585 testcase( pSrc!=pPage ); 001586 *pRC = SQLITE_CORRUPT_BKPT; 001587 return; 001588 } 001589 ovfl = get4byte(&pCell[info.nSize-4]); 001590 ptrmapPut(pPage->pBt, ovfl, PTRMAP_OVERFLOW1, pPage->pgno, pRC); 001591 } 001592 } 001593 #endif 001594 001595 001596 /* 001597 ** Defragment the page given. This routine reorganizes cells within the 001598 ** page so that there are no free-blocks on the free-block list. 001599 ** 001600 ** Parameter nMaxFrag is the maximum amount of fragmented space that may be 001601 ** present in the page after this routine returns. 001602 ** 001603 ** EVIDENCE-OF: R-44582-60138 SQLite may from time to time reorganize a 001604 ** b-tree page so that there are no freeblocks or fragment bytes, all 001605 ** unused bytes are contained in the unallocated space region, and all 001606 ** cells are packed tightly at the end of the page. 
001607 */ 001608 static int defragmentPage(MemPage *pPage, int nMaxFrag){ 001609 int i; /* Loop counter */ 001610 int pc; /* Address of the i-th cell */ 001611 int hdr; /* Offset to the page header */ 001612 int size; /* Size of a cell */ 001613 int usableSize; /* Number of usable bytes on a page */ 001614 int cellOffset; /* Offset to the cell pointer array */ 001615 int cbrk; /* Offset to the cell content area */ 001616 int nCell; /* Number of cells on the page */ 001617 unsigned char *data; /* The page data */ 001618 unsigned char *temp; /* Temp area for cell content */ 001619 unsigned char *src; /* Source of content */ 001620 int iCellFirst; /* First allowable cell index */ 001621 int iCellLast; /* Last possible cell index */ 001622 int iCellStart; /* First cell offset in input */ 001623 001624 assert( sqlite3PagerIswriteable(pPage->pDbPage) ); 001625 assert( pPage->pBt!=0 ); 001626 assert( pPage->pBt->usableSize <= SQLITE_MAX_PAGE_SIZE ); 001627 assert( pPage->nOverflow==0 ); 001628 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 001629 data = pPage->aData; 001630 hdr = pPage->hdrOffset; 001631 cellOffset = pPage->cellOffset; 001632 nCell = pPage->nCell; 001633 assert( nCell==get2byte(&data[hdr+3]) || CORRUPT_DB ); 001634 iCellFirst = cellOffset + 2*nCell; 001635 usableSize = pPage->pBt->usableSize; 001636 001637 /* This block handles pages with two or fewer free blocks and nMaxFrag 001638 ** or fewer fragmented bytes. In this case it is faster to move the 001639 ** two (or one) blocks of cells using memmove() and add the required 001640 ** offsets to each pointer in the cell-pointer array than it is to 001641 ** reconstruct the entire page. 
*/ 001642 if( (int)data[hdr+7]<=nMaxFrag ){ 001643 int iFree = get2byte(&data[hdr+1]); 001644 if( iFree>usableSize-4 ) return SQLITE_CORRUPT_PAGE(pPage); 001645 if( iFree ){ 001646 int iFree2 = get2byte(&data[iFree]); 001647 if( iFree2>usableSize-4 ) return SQLITE_CORRUPT_PAGE(pPage); 001648 if( 0==iFree2 || (data[iFree2]==0 && data[iFree2+1]==0) ){ 001649 u8 *pEnd = &data[cellOffset + nCell*2]; 001650 u8 *pAddr; 001651 int sz2 = 0; 001652 int sz = get2byte(&data[iFree+2]); 001653 int top = get2byte(&data[hdr+5]); 001654 if( top>=iFree ){ 001655 return SQLITE_CORRUPT_PAGE(pPage); 001656 } 001657 if( iFree2 ){ 001658 if( iFree+sz>iFree2 ) return SQLITE_CORRUPT_PAGE(pPage); 001659 sz2 = get2byte(&data[iFree2+2]); 001660 if( iFree2+sz2 > usableSize ) return SQLITE_CORRUPT_PAGE(pPage); 001661 memmove(&data[iFree+sz+sz2], &data[iFree+sz], iFree2-(iFree+sz)); 001662 sz += sz2; 001663 }else if( iFree+sz>usableSize ){ 001664 return SQLITE_CORRUPT_PAGE(pPage); 001665 } 001666 001667 cbrk = top+sz; 001668 assert( cbrk+(iFree-top) <= usableSize ); 001669 memmove(&data[cbrk], &data[top], iFree-top); 001670 for(pAddr=&data[cellOffset]; pAddr<pEnd; pAddr+=2){ 001671 pc = get2byte(pAddr); 001672 if( pc<iFree ){ put2byte(pAddr, pc+sz); } 001673 else if( pc<iFree2 ){ put2byte(pAddr, pc+sz2); } 001674 } 001675 goto defragment_out; 001676 } 001677 } 001678 } 001679 001680 cbrk = usableSize; 001681 iCellLast = usableSize - 4; 001682 iCellStart = get2byte(&data[hdr+5]); 001683 if( nCell>0 ){ 001684 temp = sqlite3PagerTempSpace(pPage->pBt->pPager); 001685 memcpy(temp, data, usableSize); 001686 src = temp; 001687 for(i=0; i<nCell; i++){ 001688 u8 *pAddr; /* The i-th cell pointer */ 001689 pAddr = &data[cellOffset + i*2]; 001690 pc = get2byte(pAddr); 001691 testcase( pc==iCellFirst ); 001692 testcase( pc==iCellLast ); 001693 /* These conditions have already been verified in btreeInitPage() 001694 ** if PRAGMA cell_size_check=ON. 
001695 */ 001696 if( pc>iCellLast ){ 001697 return SQLITE_CORRUPT_PAGE(pPage); 001698 } 001699 assert( pc>=0 && pc<=iCellLast ); 001700 size = pPage->xCellSize(pPage, &src[pc]); 001701 cbrk -= size; 001702 if( cbrk<iCellStart || pc+size>usableSize ){ 001703 return SQLITE_CORRUPT_PAGE(pPage); 001704 } 001705 assert( cbrk+size<=usableSize && cbrk>=iCellStart ); 001706 testcase( cbrk+size==usableSize ); 001707 testcase( pc+size==usableSize ); 001708 put2byte(pAddr, cbrk); 001709 memcpy(&data[cbrk], &src[pc], size); 001710 } 001711 } 001712 data[hdr+7] = 0; 001713 001714 defragment_out: 001715 assert( pPage->nFree>=0 ); 001716 if( data[hdr+7]+cbrk-iCellFirst!=pPage->nFree ){ 001717 return SQLITE_CORRUPT_PAGE(pPage); 001718 } 001719 assert( cbrk>=iCellFirst ); 001720 put2byte(&data[hdr+5], cbrk); 001721 data[hdr+1] = 0; 001722 data[hdr+2] = 0; 001723 memset(&data[iCellFirst], 0, cbrk-iCellFirst); 001724 assert( sqlite3PagerIswriteable(pPage->pDbPage) ); 001725 return SQLITE_OK; 001726 } 001727 001728 /* 001729 ** Search the free-list on page pPg for space to store a cell nByte bytes in 001730 ** size. If one can be found, return a pointer to the space and remove it 001731 ** from the free-list. 001732 ** 001733 ** If no suitable space can be found on the free-list, return NULL. 001734 ** 001735 ** This function may detect corruption within pPg. If corruption is 001736 ** detected then *pRc is set to SQLITE_CORRUPT and NULL is returned. 001737 ** 001738 ** Slots on the free list that are between 1 and 3 bytes larger than nByte 001739 ** will be ignored if adding the extra space to the fragmentation count 001740 ** causes the fragmentation count to exceed 60. 
001741 */ 001742 static u8 *pageFindSlot(MemPage *pPg, int nByte, int *pRc){ 001743 const int hdr = pPg->hdrOffset; /* Offset to page header */ 001744 u8 * const aData = pPg->aData; /* Page data */ 001745 int iAddr = hdr + 1; /* Address of ptr to pc */ 001746 u8 *pTmp = &aData[iAddr]; /* Temporary ptr into aData[] */ 001747 int pc = get2byte(pTmp); /* Address of a free slot */ 001748 int x; /* Excess size of the slot */ 001749 int maxPC = pPg->pBt->usableSize - nByte; /* Max address for a usable slot */ 001750 int size; /* Size of the free slot */ 001751 001752 assert( pc>0 ); 001753 while( pc<=maxPC ){ 001754 /* EVIDENCE-OF: R-22710-53328 The third and fourth bytes of each 001755 ** freeblock form a big-endian integer which is the size of the freeblock 001756 ** in bytes, including the 4-byte header. */ 001757 pTmp = &aData[pc+2]; 001758 size = get2byte(pTmp); 001759 if( (x = size - nByte)>=0 ){ 001760 testcase( x==4 ); 001761 testcase( x==3 ); 001762 if( x<4 ){ 001763 /* EVIDENCE-OF: R-11498-58022 In a well-formed b-tree page, the total 001764 ** number of bytes in fragments may not exceed 60. */ 001765 if( aData[hdr+7]>57 ) return 0; 001766 001767 /* Remove the slot from the free-list. Update the number of 001768 ** fragmented bytes within the page. */ 001769 memcpy(&aData[iAddr], &aData[pc], 2); 001770 aData[hdr+7] += (u8)x; 001771 return &aData[pc]; 001772 }else if( x+pc > maxPC ){ 001773 /* This slot extends off the end of the usable part of the page */ 001774 *pRc = SQLITE_CORRUPT_PAGE(pPg); 001775 return 0; 001776 }else{ 001777 /* The slot remains on the free-list. Reduce its size to account 001778 ** for the portion used by the new allocation. 
*/ 001779 put2byte(&aData[pc+2], x); 001780 } 001781 return &aData[pc + x]; 001782 } 001783 iAddr = pc; 001784 pTmp = &aData[pc]; 001785 pc = get2byte(pTmp); 001786 if( pc<=iAddr ){ 001787 if( pc ){ 001788 /* The next slot in the chain comes before the current slot */ 001789 *pRc = SQLITE_CORRUPT_PAGE(pPg); 001790 } 001791 return 0; 001792 } 001793 } 001794 if( pc>maxPC+nByte-4 ){ 001795 /* The free slot chain extends off the end of the page */ 001796 *pRc = SQLITE_CORRUPT_PAGE(pPg); 001797 } 001798 return 0; 001799 } 001800 001801 /* 001802 ** Allocate nByte bytes of space from within the B-Tree page passed 001803 ** as the first argument. Write into *pIdx the index into pPage->aData[] 001804 ** of the first byte of allocated space. Return either SQLITE_OK or 001805 ** an error code (usually SQLITE_CORRUPT). 001806 ** 001807 ** The caller guarantees that there is sufficient space to make the 001808 ** allocation. This routine might need to defragment in order to bring 001809 ** all the space together, however. This routine will avoid using 001810 ** the first two bytes past the cell pointer area since presumably this 001811 ** allocation is being made in order to insert a new cell, so we will 001812 ** also end up needing a new cell pointer. 
001813 */ 001814 static SQLITE_INLINE int allocateSpace(MemPage *pPage, int nByte, int *pIdx){ 001815 const int hdr = pPage->hdrOffset; /* Local cache of pPage->hdrOffset */ 001816 u8 * const data = pPage->aData; /* Local cache of pPage->aData */ 001817 int top; /* First byte of cell content area */ 001818 int rc = SQLITE_OK; /* Integer return code */ 001819 u8 *pTmp; /* Temp ptr into data[] */ 001820 int gap; /* First byte of gap between cell pointers and cell content */ 001821 001822 assert( sqlite3PagerIswriteable(pPage->pDbPage) ); 001823 assert( pPage->pBt ); 001824 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 001825 assert( nByte>=0 ); /* Minimum cell size is 4 */ 001826 assert( pPage->nFree>=nByte ); 001827 assert( pPage->nOverflow==0 ); 001828 assert( nByte < (int)(pPage->pBt->usableSize-8) ); 001829 001830 assert( pPage->cellOffset == hdr + 12 - 4*pPage->leaf ); 001831 gap = pPage->cellOffset + 2*pPage->nCell; 001832 assert( gap<=65536 ); 001833 /* EVIDENCE-OF: R-29356-02391 If the database uses a 65536-byte page size 001834 ** and the reserved space is zero (the usual value for reserved space) 001835 ** then the cell content offset of an empty page wants to be 65536. 001836 ** However, that integer is too large to be stored in a 2-byte unsigned 001837 ** integer, so a value of 0 is used in its place. */ 001838 pTmp = &data[hdr+5]; 001839 top = get2byte(pTmp); 001840 if( gap>top ){ 001841 if( top==0 && pPage->pBt->usableSize==65536 ){ 001842 top = 65536; 001843 }else{ 001844 return SQLITE_CORRUPT_PAGE(pPage); 001845 } 001846 }else if( top>(int)pPage->pBt->usableSize ){ 001847 return SQLITE_CORRUPT_PAGE(pPage); 001848 } 001849 001850 /* If there is enough space between gap and top for one more cell pointer, 001851 ** and if the freelist is not empty, then search the 001852 ** freelist looking for a slot big enough to satisfy the request. 
001853 */ 001854 testcase( gap+2==top ); 001855 testcase( gap+1==top ); 001856 testcase( gap==top ); 001857 if( (data[hdr+2] || data[hdr+1]) && gap+2<=top ){ 001858 u8 *pSpace = pageFindSlot(pPage, nByte, &rc); 001859 if( pSpace ){ 001860 int g2; 001861 assert( pSpace+nByte<=data+pPage->pBt->usableSize ); 001862 *pIdx = g2 = (int)(pSpace-data); 001863 if( g2<=gap ){ 001864 return SQLITE_CORRUPT_PAGE(pPage); 001865 }else{ 001866 return SQLITE_OK; 001867 } 001868 }else if( rc ){ 001869 return rc; 001870 } 001871 } 001872 001873 /* The request could not be fulfilled using a freelist slot. Check 001874 ** to see if defragmentation is necessary. 001875 */ 001876 testcase( gap+2+nByte==top ); 001877 if( gap+2+nByte>top ){ 001878 assert( pPage->nCell>0 || CORRUPT_DB ); 001879 assert( pPage->nFree>=0 ); 001880 rc = defragmentPage(pPage, MIN(4, pPage->nFree - (2+nByte))); 001881 if( rc ) return rc; 001882 top = get2byteNotZero(&data[hdr+5]); 001883 assert( gap+2+nByte<=top ); 001884 } 001885 001886 001887 /* Allocate memory from the gap in between the cell pointer array 001888 ** and the cell content area. The btreeComputeFreeSpace() call has already 001889 ** validated the freelist. Given that the freelist is valid, there 001890 ** is no way that the allocation can extend off the end of the page. 001891 ** The assert() below verifies the previous sentence. 001892 */ 001893 top -= nByte; 001894 put2byte(&data[hdr+5], top); 001895 assert( top+nByte <= (int)pPage->pBt->usableSize ); 001896 *pIdx = top; 001897 return SQLITE_OK; 001898 } 001899 001900 /* 001901 ** Return a section of the pPage->aData to the freelist. 001902 ** The first byte of the new free block is pPage->aData[iStart] 001903 ** and the size of the block is iSize bytes. 001904 ** 001905 ** Adjacent freeblocks are coalesced. 001906 ** 001907 ** Even though the freeblock list was checked by btreeComputeFreeSpace(), 001908 ** that routine will not detect overlap between cells or freeblocks. 
Nor 001909 ** does it detect cells or freeblocks that encroach into the reserved bytes 001910 ** at the end of the page. So do additional corruption checks inside this 001911 ** routine and return SQLITE_CORRUPT if any problems are found. 001912 */ 001913 static int freeSpace(MemPage *pPage, u16 iStart, u16 iSize){ 001914 u16 iPtr; /* Address of ptr to next freeblock */ 001915 u16 iFreeBlk; /* Address of the next freeblock */ 001916 u8 hdr; /* Page header size. 0 or 100 */ 001917 u8 nFrag = 0; /* Reduction in fragmentation */ 001918 u16 iOrigSize = iSize; /* Original value of iSize */ 001919 u16 x; /* Offset to cell content area */ 001920 u32 iEnd = iStart + iSize; /* First byte past the iStart buffer */ 001921 unsigned char *data = pPage->aData; /* Page content */ 001922 u8 *pTmp; /* Temporary ptr into data[] */ 001923 001924 assert( pPage->pBt!=0 ); 001925 assert( sqlite3PagerIswriteable(pPage->pDbPage) ); 001926 assert( CORRUPT_DB || iStart>=pPage->hdrOffset+6+pPage->childPtrSize ); 001927 assert( CORRUPT_DB || iEnd <= pPage->pBt->usableSize ); 001928 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 001929 assert( iSize>=4 ); /* Minimum cell size is 4 */ 001930 assert( CORRUPT_DB || iStart<=pPage->pBt->usableSize-4 ); 001931 001932 /* The list of freeblocks must be in ascending order. Find the 001933 ** spot on the list where iStart should be inserted. 
001934 */ 001935 hdr = pPage->hdrOffset; 001936 iPtr = hdr + 1; 001937 if( data[iPtr+1]==0 && data[iPtr]==0 ){ 001938 iFreeBlk = 0; /* Shortcut for the case when the freelist is empty */ 001939 }else{ 001940 while( (iFreeBlk = get2byte(&data[iPtr]))<iStart ){ 001941 if( iFreeBlk<=iPtr ){ 001942 if( iFreeBlk==0 ) break; /* TH3: corrupt082.100 */ 001943 return SQLITE_CORRUPT_PAGE(pPage); 001944 } 001945 iPtr = iFreeBlk; 001946 } 001947 if( iFreeBlk>pPage->pBt->usableSize-4 ){ /* TH3: corrupt081.100 */ 001948 return SQLITE_CORRUPT_PAGE(pPage); 001949 } 001950 assert( iFreeBlk>iPtr || iFreeBlk==0 || CORRUPT_DB ); 001951 001952 /* At this point: 001953 ** iFreeBlk: First freeblock after iStart, or zero if none 001954 ** iPtr: The address of a pointer to iFreeBlk 001955 ** 001956 ** Check to see if iFreeBlk should be coalesced onto the end of iStart. 001957 */ 001958 if( iFreeBlk && iEnd+3>=iFreeBlk ){ 001959 nFrag = iFreeBlk - iEnd; 001960 if( iEnd>iFreeBlk ) return SQLITE_CORRUPT_PAGE(pPage); 001961 iEnd = iFreeBlk + get2byte(&data[iFreeBlk+2]); 001962 if( iEnd > pPage->pBt->usableSize ){ 001963 return SQLITE_CORRUPT_PAGE(pPage); 001964 } 001965 iSize = iEnd - iStart; 001966 iFreeBlk = get2byte(&data[iFreeBlk]); 001967 } 001968 001969 /* If iPtr is another freeblock (that is, if iPtr is not the freelist 001970 ** pointer in the page header) then check to see if iStart should be 001971 ** coalesced onto the end of iPtr. 
001972 */ 001973 if( iPtr>hdr+1 ){ 001974 int iPtrEnd = iPtr + get2byte(&data[iPtr+2]); 001975 if( iPtrEnd+3>=iStart ){ 001976 if( iPtrEnd>iStart ) return SQLITE_CORRUPT_PAGE(pPage); 001977 nFrag += iStart - iPtrEnd; 001978 iSize = iEnd - iPtr; 001979 iStart = iPtr; 001980 } 001981 } 001982 if( nFrag>data[hdr+7] ) return SQLITE_CORRUPT_PAGE(pPage); 001983 data[hdr+7] -= nFrag; 001984 } 001985 pTmp = &data[hdr+5]; 001986 x = get2byte(pTmp); 001987 if( pPage->pBt->btsFlags & BTS_FAST_SECURE ){ 001988 /* Overwrite deleted information with zeros when the secure_delete 001989 ** option is enabled */ 001990 memset(&data[iStart], 0, iSize); 001991 } 001992 if( iStart<=x ){ 001993 /* The new freeblock is at the beginning of the cell content area, 001994 ** so just extend the cell content area rather than create another 001995 ** freelist entry */ 001996 if( iStart<x ) return SQLITE_CORRUPT_PAGE(pPage); 001997 if( iPtr!=hdr+1 ) return SQLITE_CORRUPT_PAGE(pPage); 001998 put2byte(&data[hdr+1], iFreeBlk); 001999 put2byte(&data[hdr+5], iEnd); 002000 }else{ 002001 /* Insert the new freeblock into the freelist */ 002002 put2byte(&data[iPtr], iStart); 002003 put2byte(&data[iStart], iFreeBlk); 002004 put2byte(&data[iStart+2], iSize); 002005 } 002006 pPage->nFree += iOrigSize; 002007 return SQLITE_OK; 002008 } 002009 002010 /* 002011 ** Decode the flags byte (the first byte of the header) for a page 002012 ** and initialize fields of the MemPage structure accordingly. 002013 ** 002014 ** Only the following combinations are supported. Anything different 002015 ** indicates a corrupt database files: 002016 ** 002017 ** PTF_ZERODATA (0x02, 2) 002018 ** PTF_LEAFDATA | PTF_INTKEY (0x05, 5) 002019 ** PTF_ZERODATA | PTF_LEAF (0x0a, 10) 002020 ** PTF_LEAFDATA | PTF_INTKEY | PTF_LEAF (0x0d, 13) 002021 */ 002022 static int decodeFlags(MemPage *pPage, int flagByte){ 002023 BtShared *pBt; /* A copy of pPage->pBt */ 002024 002025 assert( pPage->hdrOffset==(pPage->pgno==1 ? 
100 : 0) ); 002026 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 002027 pBt = pPage->pBt; 002028 pPage->max1bytePayload = pBt->max1bytePayload; 002029 if( flagByte>=(PTF_ZERODATA | PTF_LEAF) ){ 002030 pPage->childPtrSize = 0; 002031 pPage->leaf = 1; 002032 if( flagByte==(PTF_LEAFDATA | PTF_INTKEY | PTF_LEAF) ){ 002033 pPage->intKeyLeaf = 1; 002034 pPage->xCellSize = cellSizePtrTableLeaf; 002035 pPage->xParseCell = btreeParseCellPtr; 002036 pPage->intKey = 1; 002037 pPage->maxLocal = pBt->maxLeaf; 002038 pPage->minLocal = pBt->minLeaf; 002039 }else if( flagByte==(PTF_ZERODATA | PTF_LEAF) ){ 002040 pPage->intKey = 0; 002041 pPage->intKeyLeaf = 0; 002042 pPage->xCellSize = cellSizePtrIdxLeaf; 002043 pPage->xParseCell = btreeParseCellPtrIndex; 002044 pPage->maxLocal = pBt->maxLocal; 002045 pPage->minLocal = pBt->minLocal; 002046 }else{ 002047 pPage->intKey = 0; 002048 pPage->intKeyLeaf = 0; 002049 pPage->xCellSize = cellSizePtrIdxLeaf; 002050 pPage->xParseCell = btreeParseCellPtrIndex; 002051 return SQLITE_CORRUPT_PAGE(pPage); 002052 } 002053 }else{ 002054 pPage->childPtrSize = 4; 002055 pPage->leaf = 0; 002056 if( flagByte==(PTF_ZERODATA) ){ 002057 pPage->intKey = 0; 002058 pPage->intKeyLeaf = 0; 002059 pPage->xCellSize = cellSizePtr; 002060 pPage->xParseCell = btreeParseCellPtrIndex; 002061 pPage->maxLocal = pBt->maxLocal; 002062 pPage->minLocal = pBt->minLocal; 002063 }else if( flagByte==(PTF_LEAFDATA | PTF_INTKEY) ){ 002064 pPage->intKeyLeaf = 0; 002065 pPage->xCellSize = cellSizePtrNoPayload; 002066 pPage->xParseCell = btreeParseCellPtrNoPayload; 002067 pPage->intKey = 1; 002068 pPage->maxLocal = pBt->maxLeaf; 002069 pPage->minLocal = pBt->minLeaf; 002070 }else{ 002071 pPage->intKey = 0; 002072 pPage->intKeyLeaf = 0; 002073 pPage->xCellSize = cellSizePtr; 002074 pPage->xParseCell = btreeParseCellPtrIndex; 002075 return SQLITE_CORRUPT_PAGE(pPage); 002076 } 002077 } 002078 return SQLITE_OK; 002079 } 002080 002081 /* 002082 ** Compute the amount of freespace on 
the page. In other words, fill 002083 ** in the pPage->nFree field. 002084 */ 002085 static int btreeComputeFreeSpace(MemPage *pPage){ 002086 int pc; /* Address of a freeblock within pPage->aData[] */ 002087 u8 hdr; /* Offset to beginning of page header */ 002088 u8 *data; /* Equal to pPage->aData */ 002089 int usableSize; /* Amount of usable space on each page */ 002090 int nFree; /* Number of unused bytes on the page */ 002091 int top; /* First byte of the cell content area */ 002092 int iCellFirst; /* First allowable cell or freeblock offset */ 002093 int iCellLast; /* Last possible cell or freeblock offset */ 002094 002095 assert( pPage->pBt!=0 ); 002096 assert( pPage->pBt->db!=0 ); 002097 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 002098 assert( pPage->pgno==sqlite3PagerPagenumber(pPage->pDbPage) ); 002099 assert( pPage == sqlite3PagerGetExtra(pPage->pDbPage) ); 002100 assert( pPage->aData == sqlite3PagerGetData(pPage->pDbPage) ); 002101 assert( pPage->isInit==1 ); 002102 assert( pPage->nFree<0 ); 002103 002104 usableSize = pPage->pBt->usableSize; 002105 hdr = pPage->hdrOffset; 002106 data = pPage->aData; 002107 /* EVIDENCE-OF: R-58015-48175 The two-byte integer at offset 5 designates 002108 ** the start of the cell content area. A zero value for this integer is 002109 ** interpreted as 65536. */ 002110 top = get2byteNotZero(&data[hdr+5]); 002111 iCellFirst = hdr + 8 + pPage->childPtrSize + 2*pPage->nCell; 002112 iCellLast = usableSize - 4; 002113 002114 /* Compute the total free space on the page 002115 ** EVIDENCE-OF: R-23588-34450 The two-byte integer at offset 1 gives the 002116 ** start of the first freeblock on the page, or is zero if there are no 002117 ** freeblocks. 
*/ 002118 pc = get2byte(&data[hdr+1]); 002119 nFree = data[hdr+7] + top; /* Init nFree to non-freeblock free space */ 002120 if( pc>0 ){ 002121 u32 next, size; 002122 if( pc<top ){ 002123 /* EVIDENCE-OF: R-55530-52930 In a well-formed b-tree page, there will 002124 ** always be at least one cell before the first freeblock. 002125 */ 002126 return SQLITE_CORRUPT_PAGE(pPage); 002127 } 002128 while( 1 ){ 002129 if( pc>iCellLast ){ 002130 /* Freeblock off the end of the page */ 002131 return SQLITE_CORRUPT_PAGE(pPage); 002132 } 002133 next = get2byte(&data[pc]); 002134 size = get2byte(&data[pc+2]); 002135 nFree = nFree + size; 002136 if( next<=pc+size+3 ) break; 002137 pc = next; 002138 } 002139 if( next>0 ){ 002140 /* Freeblock not in ascending order */ 002141 return SQLITE_CORRUPT_PAGE(pPage); 002142 } 002143 if( pc+size>(unsigned int)usableSize ){ 002144 /* Last freeblock extends past page end */ 002145 return SQLITE_CORRUPT_PAGE(pPage); 002146 } 002147 } 002148 002149 /* At this point, nFree contains the sum of the offset to the start 002150 ** of the cell-content area plus the number of free bytes within 002151 ** the cell-content area. If this is greater than the usable-size 002152 ** of the page, then the page must be corrupted. This check also 002153 ** serves to verify that the offset to the start of the cell-content 002154 ** area, according to the page header, lies within the page. 
002155 */ 002156 if( nFree>usableSize || nFree<iCellFirst ){ 002157 return SQLITE_CORRUPT_PAGE(pPage); 002158 } 002159 pPage->nFree = (u16)(nFree - iCellFirst); 002160 return SQLITE_OK; 002161 } 002162 002163 /* 002164 ** Do additional sanity check after btreeInitPage() if 002165 ** PRAGMA cell_size_check=ON 002166 */ 002167 static SQLITE_NOINLINE int btreeCellSizeCheck(MemPage *pPage){ 002168 int iCellFirst; /* First allowable cell or freeblock offset */ 002169 int iCellLast; /* Last possible cell or freeblock offset */ 002170 int i; /* Index into the cell pointer array */ 002171 int sz; /* Size of a cell */ 002172 int pc; /* Address of a freeblock within pPage->aData[] */ 002173 u8 *data; /* Equal to pPage->aData */ 002174 int usableSize; /* Maximum usable space on the page */ 002175 int cellOffset; /* Start of cell content area */ 002176 002177 iCellFirst = pPage->cellOffset + 2*pPage->nCell; 002178 usableSize = pPage->pBt->usableSize; 002179 iCellLast = usableSize - 4; 002180 data = pPage->aData; 002181 cellOffset = pPage->cellOffset; 002182 if( !pPage->leaf ) iCellLast--; 002183 for(i=0; i<pPage->nCell; i++){ 002184 pc = get2byteAligned(&data[cellOffset+i*2]); 002185 testcase( pc==iCellFirst ); 002186 testcase( pc==iCellLast ); 002187 if( pc<iCellFirst || pc>iCellLast ){ 002188 return SQLITE_CORRUPT_PAGE(pPage); 002189 } 002190 sz = pPage->xCellSize(pPage, &data[pc]); 002191 testcase( pc+sz==usableSize ); 002192 if( pc+sz>usableSize ){ 002193 return SQLITE_CORRUPT_PAGE(pPage); 002194 } 002195 } 002196 return SQLITE_OK; 002197 } 002198 002199 /* 002200 ** Initialize the auxiliary information for a disk block. 002201 ** 002202 ** Return SQLITE_OK on success. If we see that the page does 002203 ** not contain a well-formed database page, then return 002204 ** SQLITE_CORRUPT. Note that a return of SQLITE_OK does not 002205 ** guarantee that the page is well-formed. It only shows that 002206 ** we failed to detect any corruption. 
002207 */ 002208 static int btreeInitPage(MemPage *pPage){ 002209 u8 *data; /* Equal to pPage->aData */ 002210 BtShared *pBt; /* The main btree structure */ 002211 002212 assert( pPage->pBt!=0 ); 002213 assert( pPage->pBt->db!=0 ); 002214 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 002215 assert( pPage->pgno==sqlite3PagerPagenumber(pPage->pDbPage) ); 002216 assert( pPage == sqlite3PagerGetExtra(pPage->pDbPage) ); 002217 assert( pPage->aData == sqlite3PagerGetData(pPage->pDbPage) ); 002218 assert( pPage->isInit==0 ); 002219 002220 pBt = pPage->pBt; 002221 data = pPage->aData + pPage->hdrOffset; 002222 /* EVIDENCE-OF: R-28594-02890 The one-byte flag at offset 0 indicating 002223 ** the b-tree page type. */ 002224 if( decodeFlags(pPage, data[0]) ){ 002225 return SQLITE_CORRUPT_PAGE(pPage); 002226 } 002227 assert( pBt->pageSize>=512 && pBt->pageSize<=65536 ); 002228 pPage->maskPage = (u16)(pBt->pageSize - 1); 002229 pPage->nOverflow = 0; 002230 pPage->cellOffset = pPage->hdrOffset + 8 + pPage->childPtrSize; 002231 pPage->aCellIdx = data + pPage->childPtrSize + 8; 002232 pPage->aDataEnd = pPage->aData + pBt->pageSize; 002233 pPage->aDataOfst = pPage->aData + pPage->childPtrSize; 002234 /* EVIDENCE-OF: R-37002-32774 The two-byte integer at offset 3 gives the 002235 ** number of cells on the page. */ 002236 pPage->nCell = get2byte(&data[3]); 002237 if( pPage->nCell>MX_CELL(pBt) ){ 002238 /* To many cells for a single page. The page must be corrupt */ 002239 return SQLITE_CORRUPT_PAGE(pPage); 002240 } 002241 testcase( pPage->nCell==MX_CELL(pBt) ); 002242 /* EVIDENCE-OF: R-24089-57979 If a page contains no cells (which is only 002243 ** possible for a root page of a table that contains no rows) then the 002244 ** offset to the cell content area will equal the page size minus the 002245 ** bytes of reserved space. 
*/ 002246 assert( pPage->nCell>0 002247 || get2byteNotZero(&data[5])==(int)pBt->usableSize 002248 || CORRUPT_DB ); 002249 pPage->nFree = -1; /* Indicate that this value is yet uncomputed */ 002250 pPage->isInit = 1; 002251 if( pBt->db->flags & SQLITE_CellSizeCk ){ 002252 return btreeCellSizeCheck(pPage); 002253 } 002254 return SQLITE_OK; 002255 } 002256 002257 /* 002258 ** Set up a raw page so that it looks like a database page holding 002259 ** no entries. 002260 */ 002261 static void zeroPage(MemPage *pPage, int flags){ 002262 unsigned char *data = pPage->aData; 002263 BtShared *pBt = pPage->pBt; 002264 u8 hdr = pPage->hdrOffset; 002265 u16 first; 002266 002267 assert( sqlite3PagerPagenumber(pPage->pDbPage)==pPage->pgno || CORRUPT_DB ); 002268 assert( sqlite3PagerGetExtra(pPage->pDbPage) == (void*)pPage ); 002269 assert( sqlite3PagerGetData(pPage->pDbPage) == data ); 002270 assert( sqlite3PagerIswriteable(pPage->pDbPage) ); 002271 assert( sqlite3_mutex_held(pBt->mutex) ); 002272 if( pBt->btsFlags & BTS_FAST_SECURE ){ 002273 memset(&data[hdr], 0, pBt->usableSize - hdr); 002274 } 002275 data[hdr] = (char)flags; 002276 first = hdr + ((flags&PTF_LEAF)==0 ? 12 : 8); 002277 memset(&data[hdr+1], 0, 4); 002278 data[hdr+7] = 0; 002279 put2byte(&data[hdr+5], pBt->usableSize); 002280 pPage->nFree = (u16)(pBt->usableSize - first); 002281 decodeFlags(pPage, flags); 002282 pPage->cellOffset = first; 002283 pPage->aDataEnd = &data[pBt->pageSize]; 002284 pPage->aCellIdx = &data[first]; 002285 pPage->aDataOfst = &data[pPage->childPtrSize]; 002286 pPage->nOverflow = 0; 002287 assert( pBt->pageSize>=512 && pBt->pageSize<=65536 ); 002288 pPage->maskPage = (u16)(pBt->pageSize - 1); 002289 pPage->nCell = 0; 002290 pPage->isInit = 1; 002291 } 002292 002293 002294 /* 002295 ** Convert a DbPage obtained from the pager into a MemPage used by 002296 ** the btree layer. 
002297 */ 002298 static MemPage *btreePageFromDbPage(DbPage *pDbPage, Pgno pgno, BtShared *pBt){ 002299 MemPage *pPage = (MemPage*)sqlite3PagerGetExtra(pDbPage); 002300 if( pgno!=pPage->pgno ){ 002301 pPage->aData = sqlite3PagerGetData(pDbPage); 002302 pPage->pDbPage = pDbPage; 002303 pPage->pBt = pBt; 002304 pPage->pgno = pgno; 002305 pPage->hdrOffset = pgno==1 ? 100 : 0; 002306 } 002307 assert( pPage->aData==sqlite3PagerGetData(pDbPage) ); 002308 return pPage; 002309 } 002310 002311 /* 002312 ** Get a page from the pager. Initialize the MemPage.pBt and 002313 ** MemPage.aData elements if needed. See also: btreeGetUnusedPage(). 002314 ** 002315 ** If the PAGER_GET_NOCONTENT flag is set, it means that we do not care 002316 ** about the content of the page at this time. So do not go to the disk 002317 ** to fetch the content. Just fill in the content with zeros for now. 002318 ** If in the future we call sqlite3PagerWrite() on this page, that 002319 ** means we have started to be concerned about content and the disk 002320 ** read should occur at that point. 002321 */ 002322 static int btreeGetPage( 002323 BtShared *pBt, /* The btree */ 002324 Pgno pgno, /* Number of the page to fetch */ 002325 MemPage **ppPage, /* Return the page in this parameter */ 002326 int flags /* PAGER_GET_NOCONTENT or PAGER_GET_READONLY */ 002327 ){ 002328 int rc; 002329 DbPage *pDbPage; 002330 002331 assert( flags==0 || flags==PAGER_GET_NOCONTENT || flags==PAGER_GET_READONLY ); 002332 assert( sqlite3_mutex_held(pBt->mutex) ); 002333 rc = sqlite3PagerGet(pBt->pPager, pgno, (DbPage**)&pDbPage, flags); 002334 if( rc ) return rc; 002335 *ppPage = btreePageFromDbPage(pDbPage, pgno, pBt); 002336 return SQLITE_OK; 002337 } 002338 002339 /* 002340 ** Retrieve a page from the pager cache. If the requested page is not 002341 ** already in the pager cache return NULL. Initialize the MemPage.pBt and 002342 ** MemPage.aData elements if needed. 
002343 */ 002344 static MemPage *btreePageLookup(BtShared *pBt, Pgno pgno){ 002345 DbPage *pDbPage; 002346 assert( sqlite3_mutex_held(pBt->mutex) ); 002347 pDbPage = sqlite3PagerLookup(pBt->pPager, pgno); 002348 if( pDbPage ){ 002349 return btreePageFromDbPage(pDbPage, pgno, pBt); 002350 } 002351 return 0; 002352 } 002353 002354 /* 002355 ** Return the size of the database file in pages. If there is any kind of 002356 ** error, return ((unsigned int)-1). 002357 */ 002358 static Pgno btreePagecount(BtShared *pBt){ 002359 return pBt->nPage; 002360 } 002361 Pgno sqlite3BtreeLastPage(Btree *p){ 002362 assert( sqlite3BtreeHoldsMutex(p) ); 002363 return btreePagecount(p->pBt); 002364 } 002365 002366 /* 002367 ** Get a page from the pager and initialize it. 002368 */ 002369 static int getAndInitPage( 002370 BtShared *pBt, /* The database file */ 002371 Pgno pgno, /* Number of the page to get */ 002372 MemPage **ppPage, /* Write the page pointer here */ 002373 int bReadOnly /* True for a read-only page */ 002374 ){ 002375 int rc; 002376 DbPage *pDbPage; 002377 MemPage *pPage; 002378 assert( sqlite3_mutex_held(pBt->mutex) ); 002379 002380 if( pgno>btreePagecount(pBt) ){ 002381 *ppPage = 0; 002382 return SQLITE_CORRUPT_BKPT; 002383 } 002384 rc = sqlite3PagerGet(pBt->pPager, pgno, (DbPage**)&pDbPage, bReadOnly); 002385 if( rc ){ 002386 *ppPage = 0; 002387 return rc; 002388 } 002389 pPage = (MemPage*)sqlite3PagerGetExtra(pDbPage); 002390 if( pPage->isInit==0 ){ 002391 btreePageFromDbPage(pDbPage, pgno, pBt); 002392 rc = btreeInitPage(pPage); 002393 if( rc!=SQLITE_OK ){ 002394 releasePage(pPage); 002395 *ppPage = 0; 002396 return rc; 002397 } 002398 } 002399 assert( pPage->pgno==pgno || CORRUPT_DB ); 002400 assert( pPage->aData==sqlite3PagerGetData(pDbPage) ); 002401 *ppPage = pPage; 002402 return SQLITE_OK; 002403 } 002404 002405 /* 002406 ** Release a MemPage. This should be called once for each prior 002407 ** call to btreeGetPage. 
002408 ** 002409 ** Page1 is a special case and must be released using releasePageOne(). 002410 */ 002411 static void releasePageNotNull(MemPage *pPage){ 002412 assert( pPage->aData ); 002413 assert( pPage->pBt ); 002414 assert( pPage->pDbPage!=0 ); 002415 assert( sqlite3PagerGetExtra(pPage->pDbPage) == (void*)pPage ); 002416 assert( sqlite3PagerGetData(pPage->pDbPage)==pPage->aData ); 002417 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 002418 sqlite3PagerUnrefNotNull(pPage->pDbPage); 002419 } 002420 static void releasePage(MemPage *pPage){ 002421 if( pPage ) releasePageNotNull(pPage); 002422 } 002423 static void releasePageOne(MemPage *pPage){ 002424 assert( pPage!=0 ); 002425 assert( pPage->aData ); 002426 assert( pPage->pBt ); 002427 assert( pPage->pDbPage!=0 ); 002428 assert( sqlite3PagerGetExtra(pPage->pDbPage) == (void*)pPage ); 002429 assert( sqlite3PagerGetData(pPage->pDbPage)==pPage->aData ); 002430 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 002431 sqlite3PagerUnrefPageOne(pPage->pDbPage); 002432 } 002433 002434 /* 002435 ** Get an unused page. 002436 ** 002437 ** This works just like btreeGetPage() with the addition: 002438 ** 002439 ** * If the page is already in use for some other purpose, immediately 002440 ** release it and return an SQLITE_CURRUPT error. 
002441 ** * Make sure the isInit flag is clear 002442 */ 002443 static int btreeGetUnusedPage( 002444 BtShared *pBt, /* The btree */ 002445 Pgno pgno, /* Number of the page to fetch */ 002446 MemPage **ppPage, /* Return the page in this parameter */ 002447 int flags /* PAGER_GET_NOCONTENT or PAGER_GET_READONLY */ 002448 ){ 002449 int rc = btreeGetPage(pBt, pgno, ppPage, flags); 002450 if( rc==SQLITE_OK ){ 002451 if( sqlite3PagerPageRefcount((*ppPage)->pDbPage)>1 ){ 002452 releasePage(*ppPage); 002453 *ppPage = 0; 002454 return SQLITE_CORRUPT_BKPT; 002455 } 002456 (*ppPage)->isInit = 0; 002457 }else{ 002458 *ppPage = 0; 002459 } 002460 return rc; 002461 } 002462 002463 002464 /* 002465 ** During a rollback, when the pager reloads information into the cache 002466 ** so that the cache is restored to its original state at the start of 002467 ** the transaction, for each page restored this routine is called. 002468 ** 002469 ** This routine needs to reset the extra data section at the end of the 002470 ** page to agree with the restored data. 002471 */ 002472 static void pageReinit(DbPage *pData){ 002473 MemPage *pPage; 002474 pPage = (MemPage *)sqlite3PagerGetExtra(pData); 002475 assert( sqlite3PagerPageRefcount(pData)>0 ); 002476 if( pPage->isInit ){ 002477 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 002478 pPage->isInit = 0; 002479 if( sqlite3PagerPageRefcount(pData)>1 ){ 002480 /* pPage might not be a btree page; it might be an overflow page 002481 ** or ptrmap page or a free page. In those cases, the following 002482 ** call to btreeInitPage() will likely return SQLITE_CORRUPT. 002483 ** But no harm is done by this. And it is very important that 002484 ** btreeInitPage() be called on every btree page so we make 002485 ** the call for every page that comes in for re-initializing. */ 002486 btreeInitPage(pPage); 002487 } 002488 } 002489 } 002490 002491 /* 002492 ** Invoke the busy handler for a btree. 
002493 */ 002494 static int btreeInvokeBusyHandler(void *pArg){ 002495 BtShared *pBt = (BtShared*)pArg; 002496 assert( pBt->db ); 002497 assert( sqlite3_mutex_held(pBt->db->mutex) ); 002498 return sqlite3InvokeBusyHandler(&pBt->db->busyHandler); 002499 } 002500 002501 /* 002502 ** Open a database file. 002503 ** 002504 ** zFilename is the name of the database file. If zFilename is NULL 002505 ** then an ephemeral database is created. The ephemeral database might 002506 ** be exclusively in memory, or it might use a disk-based memory cache. 002507 ** Either way, the ephemeral database will be automatically deleted 002508 ** when sqlite3BtreeClose() is called. 002509 ** 002510 ** If zFilename is ":memory:" then an in-memory database is created 002511 ** that is automatically destroyed when it is closed. 002512 ** 002513 ** The "flags" parameter is a bitmask that might contain bits like 002514 ** BTREE_OMIT_JOURNAL and/or BTREE_MEMORY. 002515 ** 002516 ** If the database is already opened in the same database connection 002517 ** and we are in shared cache mode, then the open will fail with an 002518 ** SQLITE_CONSTRAINT error. We cannot allow two or more BtShared 002519 ** objects in the same database connection since doing so will lead 002520 ** to problems with locking. 002521 */ 002522 int sqlite3BtreeOpen( 002523 sqlite3_vfs *pVfs, /* VFS to use for this b-tree */ 002524 const char *zFilename, /* Name of the file containing the BTree database */ 002525 sqlite3 *db, /* Associated database handle */ 002526 Btree **ppBtree, /* Pointer to new Btree object written here */ 002527 int flags, /* Options */ 002528 int vfsFlags /* Flags passed through to sqlite3_vfs.xOpen() */ 002529 ){ 002530 BtShared *pBt = 0; /* Shared part of btree structure */ 002531 Btree *p; /* Handle to return */ 002532 sqlite3_mutex *mutexOpen = 0; /* Prevents a race condition. 
Ticket #3537 */ 002533 int rc = SQLITE_OK; /* Result code from this function */ 002534 u8 nReserve; /* Byte of unused space on each page */ 002535 unsigned char zDbHeader[100]; /* Database header content */ 002536 002537 /* True if opening an ephemeral, temporary database */ 002538 const int isTempDb = zFilename==0 || zFilename[0]==0; 002539 002540 /* Set the variable isMemdb to true for an in-memory database, or 002541 ** false for a file-based database. 002542 */ 002543 #ifdef SQLITE_OMIT_MEMORYDB 002544 const int isMemdb = 0; 002545 #else 002546 const int isMemdb = (zFilename && strcmp(zFilename, ":memory:")==0) 002547 || (isTempDb && sqlite3TempInMemory(db)) 002548 || (vfsFlags & SQLITE_OPEN_MEMORY)!=0; 002549 #endif 002550 002551 assert( db!=0 ); 002552 assert( pVfs!=0 ); 002553 assert( sqlite3_mutex_held(db->mutex) ); 002554 assert( (flags&0xff)==flags ); /* flags fit in 8 bits */ 002555 002556 /* Only a BTREE_SINGLE database can be BTREE_UNORDERED */ 002557 assert( (flags & BTREE_UNORDERED)==0 || (flags & BTREE_SINGLE)!=0 ); 002558 002559 /* A BTREE_SINGLE database is always a temporary and/or ephemeral */ 002560 assert( (flags & BTREE_SINGLE)==0 || isTempDb ); 002561 002562 if( isMemdb ){ 002563 flags |= BTREE_MEMORY; 002564 } 002565 if( (vfsFlags & SQLITE_OPEN_MAIN_DB)!=0 && (isMemdb || isTempDb) ){ 002566 vfsFlags = (vfsFlags & ~SQLITE_OPEN_MAIN_DB) | SQLITE_OPEN_TEMP_DB; 002567 } 002568 p = sqlite3MallocZero(sizeof(Btree)); 002569 if( !p ){ 002570 return SQLITE_NOMEM_BKPT; 002571 } 002572 p->inTrans = TRANS_NONE; 002573 p->db = db; 002574 #ifndef SQLITE_OMIT_SHARED_CACHE 002575 p->lock.pBtree = p; 002576 p->lock.iTable = 1; 002577 #endif 002578 002579 #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO) 002580 /* 002581 ** If this Btree is a candidate for shared cache, try to find an 002582 ** existing BtShared object that we can share with 002583 */ 002584 if( isTempDb==0 && (isMemdb==0 || (vfsFlags&SQLITE_OPEN_URI)!=0) ){ 002585 if( 
vfsFlags & SQLITE_OPEN_SHAREDCACHE ){ 002586 int nFilename = sqlite3Strlen30(zFilename)+1; 002587 int nFullPathname = pVfs->mxPathname+1; 002588 char *zFullPathname = sqlite3Malloc(MAX(nFullPathname,nFilename)); 002589 MUTEX_LOGIC( sqlite3_mutex *mutexShared; ) 002590 002591 p->sharable = 1; 002592 if( !zFullPathname ){ 002593 sqlite3_free(p); 002594 return SQLITE_NOMEM_BKPT; 002595 } 002596 if( isMemdb ){ 002597 memcpy(zFullPathname, zFilename, nFilename); 002598 }else{ 002599 rc = sqlite3OsFullPathname(pVfs, zFilename, 002600 nFullPathname, zFullPathname); 002601 if( rc ){ 002602 if( rc==SQLITE_OK_SYMLINK ){ 002603 rc = SQLITE_OK; 002604 }else{ 002605 sqlite3_free(zFullPathname); 002606 sqlite3_free(p); 002607 return rc; 002608 } 002609 } 002610 } 002611 #if SQLITE_THREADSAFE 002612 mutexOpen = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_OPEN); 002613 sqlite3_mutex_enter(mutexOpen); 002614 mutexShared = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MAIN); 002615 sqlite3_mutex_enter(mutexShared); 002616 #endif 002617 for(pBt=GLOBAL(BtShared*,sqlite3SharedCacheList); pBt; pBt=pBt->pNext){ 002618 assert( pBt->nRef>0 ); 002619 if( 0==strcmp(zFullPathname, sqlite3PagerFilename(pBt->pPager, 0)) 002620 && sqlite3PagerVfs(pBt->pPager)==pVfs ){ 002621 int iDb; 002622 for(iDb=db->nDb-1; iDb>=0; iDb--){ 002623 Btree *pExisting = db->aDb[iDb].pBt; 002624 if( pExisting && pExisting->pBt==pBt ){ 002625 sqlite3_mutex_leave(mutexShared); 002626 sqlite3_mutex_leave(mutexOpen); 002627 sqlite3_free(zFullPathname); 002628 sqlite3_free(p); 002629 return SQLITE_CONSTRAINT; 002630 } 002631 } 002632 p->pBt = pBt; 002633 pBt->nRef++; 002634 break; 002635 } 002636 } 002637 sqlite3_mutex_leave(mutexShared); 002638 sqlite3_free(zFullPathname); 002639 } 002640 #ifdef SQLITE_DEBUG 002641 else{ 002642 /* In debug mode, we mark all persistent databases as sharable 002643 ** even when they are not. 
This exercises the locking code and 002644 ** gives more opportunity for asserts(sqlite3_mutex_held()) 002645 ** statements to find locking problems. 002646 */ 002647 p->sharable = 1; 002648 } 002649 #endif 002650 } 002651 #endif 002652 if( pBt==0 ){ 002653 /* 002654 ** The following asserts make sure that structures used by the btree are 002655 ** the right size. This is to guard against size changes that result 002656 ** when compiling on a different architecture. 002657 */ 002658 assert( sizeof(i64)==8 ); 002659 assert( sizeof(u64)==8 ); 002660 assert( sizeof(u32)==4 ); 002661 assert( sizeof(u16)==2 ); 002662 assert( sizeof(Pgno)==4 ); 002663 002664 /* Suppress false-positive compiler warning from PVS-Studio */ 002665 memset(&zDbHeader[16], 0, 8); 002666 002667 pBt = sqlite3MallocZero( sizeof(*pBt) ); 002668 if( pBt==0 ){ 002669 rc = SQLITE_NOMEM_BKPT; 002670 goto btree_open_out; 002671 } 002672 rc = sqlite3PagerOpen(pVfs, &pBt->pPager, zFilename, 002673 sizeof(MemPage), flags, vfsFlags, pageReinit); 002674 if( rc==SQLITE_OK ){ 002675 sqlite3PagerSetMmapLimit(pBt->pPager, db->szMmap); 002676 rc = sqlite3PagerReadFileheader(pBt->pPager,sizeof(zDbHeader),zDbHeader); 002677 } 002678 if( rc!=SQLITE_OK ){ 002679 goto btree_open_out; 002680 } 002681 pBt->openFlags = (u8)flags; 002682 pBt->db = db; 002683 sqlite3PagerSetBusyHandler(pBt->pPager, btreeInvokeBusyHandler, pBt); 002684 p->pBt = pBt; 002685 002686 pBt->pCursor = 0; 002687 pBt->pPage1 = 0; 002688 if( sqlite3PagerIsreadonly(pBt->pPager) ) pBt->btsFlags |= BTS_READ_ONLY; 002689 #if defined(SQLITE_SECURE_DELETE) 002690 pBt->btsFlags |= BTS_SECURE_DELETE; 002691 #elif defined(SQLITE_FAST_SECURE_DELETE) 002692 pBt->btsFlags |= BTS_OVERWRITE; 002693 #endif 002694 /* EVIDENCE-OF: R-51873-39618 The page size for a database file is 002695 ** determined by the 2-byte integer located at an offset of 16 bytes from 002696 ** the beginning of the database file. 
*/ 002697 pBt->pageSize = (zDbHeader[16]<<8) | (zDbHeader[17]<<16); 002698 if( pBt->pageSize<512 || pBt->pageSize>SQLITE_MAX_PAGE_SIZE 002699 || ((pBt->pageSize-1)&pBt->pageSize)!=0 ){ 002700 pBt->pageSize = 0; 002701 #ifndef SQLITE_OMIT_AUTOVACUUM 002702 /* If the magic name ":memory:" will create an in-memory database, then 002703 ** leave the autoVacuum mode at 0 (do not auto-vacuum), even if 002704 ** SQLITE_DEFAULT_AUTOVACUUM is true. On the other hand, if 002705 ** SQLITE_OMIT_MEMORYDB has been defined, then ":memory:" is just a 002706 ** regular file-name. In this case the auto-vacuum applies as per normal. 002707 */ 002708 if( zFilename && !isMemdb ){ 002709 pBt->autoVacuum = (SQLITE_DEFAULT_AUTOVACUUM ? 1 : 0); 002710 pBt->incrVacuum = (SQLITE_DEFAULT_AUTOVACUUM==2 ? 1 : 0); 002711 } 002712 #endif 002713 nReserve = 0; 002714 }else{ 002715 /* EVIDENCE-OF: R-37497-42412 The size of the reserved region is 002716 ** determined by the one-byte unsigned integer found at an offset of 20 002717 ** into the database file header. */ 002718 nReserve = zDbHeader[20]; 002719 pBt->btsFlags |= BTS_PAGESIZE_FIXED; 002720 #ifndef SQLITE_OMIT_AUTOVACUUM 002721 pBt->autoVacuum = (get4byte(&zDbHeader[36 + 4*4])?1:0); 002722 pBt->incrVacuum = (get4byte(&zDbHeader[36 + 7*4])?1:0); 002723 #endif 002724 } 002725 rc = sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize, nReserve); 002726 if( rc ) goto btree_open_out; 002727 pBt->usableSize = pBt->pageSize - nReserve; 002728 assert( (pBt->pageSize & 7)==0 ); /* 8-byte alignment of pageSize */ 002729 002730 #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO) 002731 /* Add the new BtShared object to the linked list sharable BtShareds. 
002732 */ 002733 pBt->nRef = 1; 002734 if( p->sharable ){ 002735 MUTEX_LOGIC( sqlite3_mutex *mutexShared; ) 002736 MUTEX_LOGIC( mutexShared = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MAIN);) 002737 if( SQLITE_THREADSAFE && sqlite3GlobalConfig.bCoreMutex ){ 002738 pBt->mutex = sqlite3MutexAlloc(SQLITE_MUTEX_FAST); 002739 if( pBt->mutex==0 ){ 002740 rc = SQLITE_NOMEM_BKPT; 002741 goto btree_open_out; 002742 } 002743 } 002744 sqlite3_mutex_enter(mutexShared); 002745 pBt->pNext = GLOBAL(BtShared*,sqlite3SharedCacheList); 002746 GLOBAL(BtShared*,sqlite3SharedCacheList) = pBt; 002747 sqlite3_mutex_leave(mutexShared); 002748 } 002749 #endif 002750 } 002751 002752 #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO) 002753 /* If the new Btree uses a sharable pBtShared, then link the new 002754 ** Btree into the list of all sharable Btrees for the same connection. 002755 ** The list is kept in ascending order by pBt address. 002756 */ 002757 if( p->sharable ){ 002758 int i; 002759 Btree *pSib; 002760 for(i=0; i<db->nDb; i++){ 002761 if( (pSib = db->aDb[i].pBt)!=0 && pSib->sharable ){ 002762 while( pSib->pPrev ){ pSib = pSib->pPrev; } 002763 if( (uptr)p->pBt<(uptr)pSib->pBt ){ 002764 p->pNext = pSib; 002765 p->pPrev = 0; 002766 pSib->pPrev = p; 002767 }else{ 002768 while( pSib->pNext && (uptr)pSib->pNext->pBt<(uptr)p->pBt ){ 002769 pSib = pSib->pNext; 002770 } 002771 p->pNext = pSib->pNext; 002772 p->pPrev = pSib; 002773 if( p->pNext ){ 002774 p->pNext->pPrev = p; 002775 } 002776 pSib->pNext = p; 002777 } 002778 break; 002779 } 002780 } 002781 } 002782 #endif 002783 *ppBtree = p; 002784 002785 btree_open_out: 002786 if( rc!=SQLITE_OK ){ 002787 if( pBt && pBt->pPager ){ 002788 sqlite3PagerClose(pBt->pPager, 0); 002789 } 002790 sqlite3_free(pBt); 002791 sqlite3_free(p); 002792 *ppBtree = 0; 002793 }else{ 002794 sqlite3_file *pFile; 002795 002796 /* If the B-Tree was successfully opened, set the pager-cache size to the 002797 ** default value. 
Except, when opening on an existing shared pager-cache, 002798 ** do not change the pager-cache size. 002799 */ 002800 if( sqlite3BtreeSchema(p, 0, 0)==0 ){ 002801 sqlite3BtreeSetCacheSize(p, SQLITE_DEFAULT_CACHE_SIZE); 002802 } 002803 002804 pFile = sqlite3PagerFile(pBt->pPager); 002805 if( pFile->pMethods ){ 002806 sqlite3OsFileControlHint(pFile, SQLITE_FCNTL_PDB, (void*)&pBt->db); 002807 } 002808 } 002809 if( mutexOpen ){ 002810 assert( sqlite3_mutex_held(mutexOpen) ); 002811 sqlite3_mutex_leave(mutexOpen); 002812 } 002813 assert( rc!=SQLITE_OK || sqlite3BtreeConnectionCount(*ppBtree)>0 ); 002814 return rc; 002815 } 002816 002817 /* 002818 ** Decrement the BtShared.nRef counter. When it reaches zero, 002819 ** remove the BtShared structure from the sharing list. Return 002820 ** true if the BtShared.nRef counter reaches zero and return 002821 ** false if it is still positive. 002822 */ 002823 static int removeFromSharingList(BtShared *pBt){ 002824 #ifndef SQLITE_OMIT_SHARED_CACHE 002825 MUTEX_LOGIC( sqlite3_mutex *pMainMtx; ) 002826 BtShared *pList; 002827 int removed = 0; 002828 002829 assert( sqlite3_mutex_notheld(pBt->mutex) ); 002830 MUTEX_LOGIC( pMainMtx = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MAIN); ) 002831 sqlite3_mutex_enter(pMainMtx); 002832 pBt->nRef--; 002833 if( pBt->nRef<=0 ){ 002834 if( GLOBAL(BtShared*,sqlite3SharedCacheList)==pBt ){ 002835 GLOBAL(BtShared*,sqlite3SharedCacheList) = pBt->pNext; 002836 }else{ 002837 pList = GLOBAL(BtShared*,sqlite3SharedCacheList); 002838 while( ALWAYS(pList) && pList->pNext!=pBt ){ 002839 pList=pList->pNext; 002840 } 002841 if( ALWAYS(pList) ){ 002842 pList->pNext = pBt->pNext; 002843 } 002844 } 002845 if( SQLITE_THREADSAFE ){ 002846 sqlite3_mutex_free(pBt->mutex); 002847 } 002848 removed = 1; 002849 } 002850 sqlite3_mutex_leave(pMainMtx); 002851 return removed; 002852 #else 002853 return 1; 002854 #endif 002855 } 002856 002857 /* 002858 ** Make sure pBt->pTmpSpace points to an allocation of 002859 ** 
MX_CELL_SIZE(pBt) bytes with a 4-byte prefix for a left-child 002860 ** pointer. 002861 */ 002862 static SQLITE_NOINLINE int allocateTempSpace(BtShared *pBt){ 002863 assert( pBt!=0 ); 002864 assert( pBt->pTmpSpace==0 ); 002865 /* This routine is called only by btreeCursor() when allocating the 002866 ** first write cursor for the BtShared object */ 002867 assert( pBt->pCursor!=0 && (pBt->pCursor->curFlags & BTCF_WriteFlag)!=0 ); 002868 pBt->pTmpSpace = sqlite3PageMalloc( pBt->pageSize ); 002869 if( pBt->pTmpSpace==0 ){ 002870 BtCursor *pCur = pBt->pCursor; 002871 pBt->pCursor = pCur->pNext; /* Unlink the cursor */ 002872 memset(pCur, 0, sizeof(*pCur)); 002873 return SQLITE_NOMEM_BKPT; 002874 } 002875 002876 /* One of the uses of pBt->pTmpSpace is to format cells before 002877 ** inserting them into a leaf page (function fillInCell()). If 002878 ** a cell is less than 4 bytes in size, it is rounded up to 4 bytes 002879 ** by the various routines that manipulate binary cells. Which 002880 ** can mean that fillInCell() only initializes the first 2 or 3 002881 ** bytes of pTmpSpace, but that the first 4 bytes are copied from 002882 ** it into a database page. This is not actually a problem, but it 002883 ** does cause a valgrind error when the 1 or 2 bytes of uninitialized 002884 ** data is passed to system call write(). So to avoid this error, 002885 ** zero the first 4 bytes of temp space here. 002886 ** 002887 ** Also: Provide four bytes of initialized space before the 002888 ** beginning of pTmpSpace as an area available to prepend the 002889 ** left-child pointer to the beginning of a cell. 
002890 */ 002891 memset(pBt->pTmpSpace, 0, 8); 002892 pBt->pTmpSpace += 4; 002893 return SQLITE_OK; 002894 } 002895 002896 /* 002897 ** Free the pBt->pTmpSpace allocation 002898 */ 002899 static void freeTempSpace(BtShared *pBt){ 002900 if( pBt->pTmpSpace ){ 002901 pBt->pTmpSpace -= 4; 002902 sqlite3PageFree(pBt->pTmpSpace); 002903 pBt->pTmpSpace = 0; 002904 } 002905 } 002906 002907 /* 002908 ** Close an open database and invalidate all cursors. 002909 */ 002910 int sqlite3BtreeClose(Btree *p){ 002911 BtShared *pBt = p->pBt; 002912 002913 /* Close all cursors opened via this handle. */ 002914 assert( sqlite3_mutex_held(p->db->mutex) ); 002915 sqlite3BtreeEnter(p); 002916 002917 /* Verify that no other cursors have this Btree open */ 002918 #ifdef SQLITE_DEBUG 002919 { 002920 BtCursor *pCur = pBt->pCursor; 002921 while( pCur ){ 002922 BtCursor *pTmp = pCur; 002923 pCur = pCur->pNext; 002924 assert( pTmp->pBtree!=p ); 002925 002926 } 002927 } 002928 #endif 002929 002930 /* Rollback any active transaction and free the handle structure. 002931 ** The call to sqlite3BtreeRollback() drops any table-locks held by 002932 ** this handle. 002933 */ 002934 sqlite3BtreeRollback(p, SQLITE_OK, 0); 002935 sqlite3BtreeLeave(p); 002936 002937 /* If there are still other outstanding references to the shared-btree 002938 ** structure, return now. The remainder of this procedure cleans 002939 ** up the shared-btree. 002940 */ 002941 assert( p->wantToLock==0 && p->locked==0 ); 002942 if( !p->sharable || removeFromSharingList(pBt) ){ 002943 /* The pBt is no longer on the sharing list, so we can access 002944 ** it without having to hold the mutex. 002945 ** 002946 ** Clean out and delete the BtShared object. 
002947 */ 002948 assert( !pBt->pCursor ); 002949 sqlite3PagerClose(pBt->pPager, p->db); 002950 if( pBt->xFreeSchema && pBt->pSchema ){ 002951 pBt->xFreeSchema(pBt->pSchema); 002952 } 002953 sqlite3DbFree(0, pBt->pSchema); 002954 freeTempSpace(pBt); 002955 sqlite3_free(pBt); 002956 } 002957 002958 #ifndef SQLITE_OMIT_SHARED_CACHE 002959 assert( p->wantToLock==0 ); 002960 assert( p->locked==0 ); 002961 if( p->pPrev ) p->pPrev->pNext = p->pNext; 002962 if( p->pNext ) p->pNext->pPrev = p->pPrev; 002963 #endif 002964 002965 sqlite3_free(p); 002966 return SQLITE_OK; 002967 } 002968 002969 /* 002970 ** Change the "soft" limit on the number of pages in the cache. 002971 ** Unused and unmodified pages will be recycled when the number of 002972 ** pages in the cache exceeds this soft limit. But the size of the 002973 ** cache is allowed to grow larger than this limit if it contains 002974 ** dirty pages or pages still in active use. 002975 */ 002976 int sqlite3BtreeSetCacheSize(Btree *p, int mxPage){ 002977 BtShared *pBt = p->pBt; 002978 assert( sqlite3_mutex_held(p->db->mutex) ); 002979 sqlite3BtreeEnter(p); 002980 sqlite3PagerSetCachesize(pBt->pPager, mxPage); 002981 sqlite3BtreeLeave(p); 002982 return SQLITE_OK; 002983 } 002984 002985 /* 002986 ** Change the "spill" limit on the number of pages in the cache. 002987 ** If the number of pages exceeds this limit during a write transaction, 002988 ** the pager might attempt to "spill" pages to the journal early in 002989 ** order to free up memory. 002990 ** 002991 ** The value returned is the current spill size. If zero is passed 002992 ** as an argument, no changes are made to the spill size setting, so 002993 ** using mxPage of 0 is a way to query the current spill size. 
002994 */ 002995 int sqlite3BtreeSetSpillSize(Btree *p, int mxPage){ 002996 BtShared *pBt = p->pBt; 002997 int res; 002998 assert( sqlite3_mutex_held(p->db->mutex) ); 002999 sqlite3BtreeEnter(p); 003000 res = sqlite3PagerSetSpillsize(pBt->pPager, mxPage); 003001 sqlite3BtreeLeave(p); 003002 return res; 003003 } 003004 003005 #if SQLITE_MAX_MMAP_SIZE>0 003006 /* 003007 ** Change the limit on the amount of the database file that may be 003008 ** memory mapped. 003009 */ 003010 int sqlite3BtreeSetMmapLimit(Btree *p, sqlite3_int64 szMmap){ 003011 BtShared *pBt = p->pBt; 003012 assert( sqlite3_mutex_held(p->db->mutex) ); 003013 sqlite3BtreeEnter(p); 003014 sqlite3PagerSetMmapLimit(pBt->pPager, szMmap); 003015 sqlite3BtreeLeave(p); 003016 return SQLITE_OK; 003017 } 003018 #endif /* SQLITE_MAX_MMAP_SIZE>0 */ 003019 003020 /* 003021 ** Change the way data is synced to disk in order to increase or decrease 003022 ** how well the database resists damage due to OS crashes and power 003023 ** failures. Level 1 is the same as asynchronous (no syncs() occur and 003024 ** there is a high probability of damage) Level 2 is the default. There 003025 ** is a very low but non-zero probability of damage. Level 3 reduces the 003026 ** probability of damage to near zero but with a write performance reduction. 003027 */ 003028 #ifndef SQLITE_OMIT_PAGER_PRAGMAS 003029 int sqlite3BtreeSetPagerFlags( 003030 Btree *p, /* The btree to set the safety level on */ 003031 unsigned pgFlags /* Various PAGER_* flags */ 003032 ){ 003033 BtShared *pBt = p->pBt; 003034 assert( sqlite3_mutex_held(p->db->mutex) ); 003035 sqlite3BtreeEnter(p); 003036 sqlite3PagerSetFlags(pBt->pPager, pgFlags); 003037 sqlite3BtreeLeave(p); 003038 return SQLITE_OK; 003039 } 003040 #endif 003041 003042 /* 003043 ** Change the default pages size and the number of reserved bytes per page. 003044 ** Or, if the page size has already been fixed, return SQLITE_READONLY 003045 ** without changing anything. 
003046 ** 003047 ** The page size must be a power of 2 between 512 and 65536. If the page 003048 ** size supplied does not meet this constraint then the page size is not 003049 ** changed. 003050 ** 003051 ** Page sizes are constrained to be a power of two so that the region 003052 ** of the database file used for locking (beginning at PENDING_BYTE, 003053 ** the first byte past the 1GB boundary, 0x40000000) needs to occur 003054 ** at the beginning of a page. 003055 ** 003056 ** If parameter nReserve is less than zero, then the number of reserved 003057 ** bytes per page is left unchanged. 003058 ** 003059 ** If the iFix!=0 then the BTS_PAGESIZE_FIXED flag is set so that the page size 003060 ** and autovacuum mode can no longer be changed. 003061 */ 003062 int sqlite3BtreeSetPageSize(Btree *p, int pageSize, int nReserve, int iFix){ 003063 int rc = SQLITE_OK; 003064 int x; 003065 BtShared *pBt = p->pBt; 003066 assert( nReserve>=0 && nReserve<=255 ); 003067 sqlite3BtreeEnter(p); 003068 pBt->nReserveWanted = nReserve; 003069 x = pBt->pageSize - pBt->usableSize; 003070 if( nReserve<x ) nReserve = x; 003071 if( pBt->btsFlags & BTS_PAGESIZE_FIXED ){ 003072 sqlite3BtreeLeave(p); 003073 return SQLITE_READONLY; 003074 } 003075 assert( nReserve>=0 && nReserve<=255 ); 003076 if( pageSize>=512 && pageSize<=SQLITE_MAX_PAGE_SIZE && 003077 ((pageSize-1)&pageSize)==0 ){ 003078 assert( (pageSize & 7)==0 ); 003079 assert( !pBt->pCursor ); 003080 if( nReserve>32 && pageSize==512 ) pageSize = 1024; 003081 pBt->pageSize = (u32)pageSize; 003082 freeTempSpace(pBt); 003083 } 003084 rc = sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize, nReserve); 003085 pBt->usableSize = pBt->pageSize - (u16)nReserve; 003086 if( iFix ) pBt->btsFlags |= BTS_PAGESIZE_FIXED; 003087 sqlite3BtreeLeave(p); 003088 return rc; 003089 } 003090 003091 /* 003092 ** Return the currently defined page size 003093 */ 003094 int sqlite3BtreeGetPageSize(Btree *p){ 003095 return p->pBt->pageSize; 003096 } 003097 003098 
/* 003099 ** This function is similar to sqlite3BtreeGetReserve(), except that it 003100 ** may only be called if it is guaranteed that the b-tree mutex is already 003101 ** held. 003102 ** 003103 ** This is useful in one special case in the backup API code where it is 003104 ** known that the shared b-tree mutex is held, but the mutex on the 003105 ** database handle that owns *p is not. In this case if sqlite3BtreeEnter() 003106 ** were to be called, it might collide with some other operation on the 003107 ** database handle that owns *p, causing undefined behavior. 003108 */ 003109 int sqlite3BtreeGetReserveNoMutex(Btree *p){ 003110 int n; 003111 assert( sqlite3_mutex_held(p->pBt->mutex) ); 003112 n = p->pBt->pageSize - p->pBt->usableSize; 003113 return n; 003114 } 003115 003116 /* 003117 ** Return the number of bytes of space at the end of every page that 003118 ** are intentionally left unused. This is the "reserved" space that is 003119 ** sometimes used by extensions. 003120 ** 003121 ** The value returned is the larger of the current reserve size and 003122 ** the latest reserve size requested by SQLITE_FILECTRL_RESERVE_BYTES. 003123 ** The amount of reserve can only grow - never shrink. 003124 */ 003125 int sqlite3BtreeGetRequestedReserve(Btree *p){ 003126 int n1, n2; 003127 sqlite3BtreeEnter(p); 003128 n1 = (int)p->pBt->nReserveWanted; 003129 n2 = sqlite3BtreeGetReserveNoMutex(p); 003130 sqlite3BtreeLeave(p); 003131 return n1>n2 ? n1 : n2; 003132 } 003133 003134 003135 /* 003136 ** Set the maximum page count for a database if mxPage is positive. 003137 ** No changes are made if mxPage is 0 or negative. 003138 ** Regardless of the value of mxPage, return the maximum page count. 
003139 */ 003140 Pgno sqlite3BtreeMaxPageCount(Btree *p, Pgno mxPage){ 003141 Pgno n; 003142 sqlite3BtreeEnter(p); 003143 n = sqlite3PagerMaxPageCount(p->pBt->pPager, mxPage); 003144 sqlite3BtreeLeave(p); 003145 return n; 003146 } 003147 003148 /* 003149 ** Change the values for the BTS_SECURE_DELETE and BTS_OVERWRITE flags: 003150 ** 003151 ** newFlag==0 Both BTS_SECURE_DELETE and BTS_OVERWRITE are cleared 003152 ** newFlag==1 BTS_SECURE_DELETE set and BTS_OVERWRITE is cleared 003153 ** newFlag==2 BTS_SECURE_DELETE cleared and BTS_OVERWRITE is set 003154 ** newFlag==(-1) No changes 003155 ** 003156 ** This routine acts as a query if newFlag is less than zero 003157 ** 003158 ** With BTS_OVERWRITE set, deleted content is overwritten by zeros, but 003159 ** freelist leaf pages are not written back to the database. Thus in-page 003160 ** deleted content is cleared, but freelist deleted content is not. 003161 ** 003162 ** With BTS_SECURE_DELETE, operation is like BTS_OVERWRITE with the addition 003163 ** that freelist leaf pages are written back into the database, increasing 003164 ** the amount of disk I/O. 003165 */ 003166 int sqlite3BtreeSecureDelete(Btree *p, int newFlag){ 003167 int b; 003168 if( p==0 ) return 0; 003169 sqlite3BtreeEnter(p); 003170 assert( BTS_OVERWRITE==BTS_SECURE_DELETE*2 ); 003171 assert( BTS_FAST_SECURE==(BTS_OVERWRITE|BTS_SECURE_DELETE) ); 003172 if( newFlag>=0 ){ 003173 p->pBt->btsFlags &= ~BTS_FAST_SECURE; 003174 p->pBt->btsFlags |= BTS_SECURE_DELETE*newFlag; 003175 } 003176 b = (p->pBt->btsFlags & BTS_FAST_SECURE)/BTS_SECURE_DELETE; 003177 sqlite3BtreeLeave(p); 003178 return b; 003179 } 003180 003181 /* 003182 ** Change the 'auto-vacuum' property of the database. If the 'autoVacuum' 003183 ** parameter is non-zero, then auto-vacuum mode is enabled. If zero, it 003184 ** is disabled. The default value for the auto-vacuum property is 003185 ** determined by the SQLITE_DEFAULT_AUTOVACUUM macro. 
003186 */ 003187 int sqlite3BtreeSetAutoVacuum(Btree *p, int autoVacuum){ 003188 #ifdef SQLITE_OMIT_AUTOVACUUM 003189 return SQLITE_READONLY; 003190 #else 003191 BtShared *pBt = p->pBt; 003192 int rc = SQLITE_OK; 003193 u8 av = (u8)autoVacuum; 003194 003195 sqlite3BtreeEnter(p); 003196 if( (pBt->btsFlags & BTS_PAGESIZE_FIXED)!=0 && (av ?1:0)!=pBt->autoVacuum ){ 003197 rc = SQLITE_READONLY; 003198 }else{ 003199 pBt->autoVacuum = av ?1:0; 003200 pBt->incrVacuum = av==2 ?1:0; 003201 } 003202 sqlite3BtreeLeave(p); 003203 return rc; 003204 #endif 003205 } 003206 003207 /* 003208 ** Return the value of the 'auto-vacuum' property. If auto-vacuum is 003209 ** enabled 1 is returned. Otherwise 0. 003210 */ 003211 int sqlite3BtreeGetAutoVacuum(Btree *p){ 003212 #ifdef SQLITE_OMIT_AUTOVACUUM 003213 return BTREE_AUTOVACUUM_NONE; 003214 #else 003215 int rc; 003216 sqlite3BtreeEnter(p); 003217 rc = ( 003218 (!p->pBt->autoVacuum)?BTREE_AUTOVACUUM_NONE: 003219 (!p->pBt->incrVacuum)?BTREE_AUTOVACUUM_FULL: 003220 BTREE_AUTOVACUUM_INCR 003221 ); 003222 sqlite3BtreeLeave(p); 003223 return rc; 003224 #endif 003225 } 003226 003227 /* 003228 ** If the user has not set the safety-level for this database connection 003229 ** using "PRAGMA synchronous", and if the safety-level is not already 003230 ** set to the value passed to this function as the second parameter, 003231 ** set it so. 
003232 */ 003233 #if SQLITE_DEFAULT_SYNCHRONOUS!=SQLITE_DEFAULT_WAL_SYNCHRONOUS \ 003234 && !defined(SQLITE_OMIT_WAL) 003235 static void setDefaultSyncFlag(BtShared *pBt, u8 safety_level){ 003236 sqlite3 *db; 003237 Db *pDb; 003238 if( (db=pBt->db)!=0 && (pDb=db->aDb)!=0 ){ 003239 while( pDb->pBt==0 || pDb->pBt->pBt!=pBt ){ pDb++; } 003240 if( pDb->bSyncSet==0 003241 && pDb->safety_level!=safety_level 003242 && pDb!=&db->aDb[1] 003243 ){ 003244 pDb->safety_level = safety_level; 003245 sqlite3PagerSetFlags(pBt->pPager, 003246 pDb->safety_level | (db->flags & PAGER_FLAGS_MASK)); 003247 } 003248 } 003249 } 003250 #else 003251 # define setDefaultSyncFlag(pBt,safety_level) 003252 #endif 003253 003254 /* Forward declaration */ 003255 static int newDatabase(BtShared*); 003256 003257 003258 /* 003259 ** Get a reference to pPage1 of the database file. This will 003260 ** also acquire a readlock on that file. 003261 ** 003262 ** SQLITE_OK is returned on success. If the file is not a 003263 ** well-formed database file, then SQLITE_CORRUPT is returned. 003264 ** SQLITE_BUSY is returned if the database is locked. SQLITE_NOMEM 003265 ** is returned if we run out of memory. 003266 */ 003267 static int lockBtree(BtShared *pBt){ 003268 int rc; /* Result code from subfunctions */ 003269 MemPage *pPage1; /* Page 1 of the database file */ 003270 u32 nPage; /* Number of pages in the database */ 003271 u32 nPageFile = 0; /* Number of pages in the database file */ 003272 003273 assert( sqlite3_mutex_held(pBt->mutex) ); 003274 assert( pBt->pPage1==0 ); 003275 rc = sqlite3PagerSharedLock(pBt->pPager); 003276 if( rc!=SQLITE_OK ) return rc; 003277 rc = btreeGetPage(pBt, 1, &pPage1, 0); 003278 if( rc!=SQLITE_OK ) return rc; 003279 003280 /* Do some checking to help insure the file we opened really is 003281 ** a valid database file. 
003282 */ 003283 nPage = get4byte(28+(u8*)pPage1->aData); 003284 sqlite3PagerPagecount(pBt->pPager, (int*)&nPageFile); 003285 if( nPage==0 || memcmp(24+(u8*)pPage1->aData, 92+(u8*)pPage1->aData,4)!=0 ){ 003286 nPage = nPageFile; 003287 } 003288 if( (pBt->db->flags & SQLITE_ResetDatabase)!=0 ){ 003289 nPage = 0; 003290 } 003291 if( nPage>0 ){ 003292 u32 pageSize; 003293 u32 usableSize; 003294 u8 *page1 = pPage1->aData; 003295 rc = SQLITE_NOTADB; 003296 /* EVIDENCE-OF: R-43737-39999 Every valid SQLite database file begins 003297 ** with the following 16 bytes (in hex): 53 51 4c 69 74 65 20 66 6f 72 6d 003298 ** 61 74 20 33 00. */ 003299 if( memcmp(page1, zMagicHeader, 16)!=0 ){ 003300 goto page1_init_failed; 003301 } 003302 003303 #ifdef SQLITE_OMIT_WAL 003304 if( page1[18]>1 ){ 003305 pBt->btsFlags |= BTS_READ_ONLY; 003306 } 003307 if( page1[19]>1 ){ 003308 goto page1_init_failed; 003309 } 003310 #else 003311 if( page1[18]>2 ){ 003312 pBt->btsFlags |= BTS_READ_ONLY; 003313 } 003314 if( page1[19]>2 ){ 003315 goto page1_init_failed; 003316 } 003317 003318 /* If the read version is set to 2, this database should be accessed 003319 ** in WAL mode. If the log is not already open, open it now. Then 003320 ** return SQLITE_OK and return without populating BtShared.pPage1. 003321 ** The caller detects this and calls this function again. This is 003322 ** required as the version of page 1 currently in the page1 buffer 003323 ** may not be the latest version - there may be a newer one in the log 003324 ** file. 
003325 */ 003326 if( page1[19]==2 && (pBt->btsFlags & BTS_NO_WAL)==0 ){ 003327 int isOpen = 0; 003328 rc = sqlite3PagerOpenWal(pBt->pPager, &isOpen); 003329 if( rc!=SQLITE_OK ){ 003330 goto page1_init_failed; 003331 }else{ 003332 setDefaultSyncFlag(pBt, SQLITE_DEFAULT_WAL_SYNCHRONOUS+1); 003333 if( isOpen==0 ){ 003334 releasePageOne(pPage1); 003335 return SQLITE_OK; 003336 } 003337 } 003338 rc = SQLITE_NOTADB; 003339 }else{ 003340 setDefaultSyncFlag(pBt, SQLITE_DEFAULT_SYNCHRONOUS+1); 003341 } 003342 #endif 003343 003344 /* EVIDENCE-OF: R-15465-20813 The maximum and minimum embedded payload 003345 ** fractions and the leaf payload fraction values must be 64, 32, and 32. 003346 ** 003347 ** The original design allowed these amounts to vary, but as of 003348 ** version 3.6.0, we require them to be fixed. 003349 */ 003350 if( memcmp(&page1[21], "\100\040\040",3)!=0 ){ 003351 goto page1_init_failed; 003352 } 003353 /* EVIDENCE-OF: R-51873-39618 The page size for a database file is 003354 ** determined by the 2-byte integer located at an offset of 16 bytes from 003355 ** the beginning of the database file. */ 003356 pageSize = (page1[16]<<8) | (page1[17]<<16); 003357 /* EVIDENCE-OF: R-25008-21688 The size of a page is a power of two 003358 ** between 512 and 65536 inclusive. */ 003359 if( ((pageSize-1)&pageSize)!=0 003360 || pageSize>SQLITE_MAX_PAGE_SIZE 003361 || pageSize<=256 003362 ){ 003363 goto page1_init_failed; 003364 } 003365 assert( (pageSize & 7)==0 ); 003366 /* EVIDENCE-OF: R-59310-51205 The "reserved space" size in the 1-byte 003367 ** integer at offset 20 is the number of bytes of space at the end of 003368 ** each page to reserve for extensions. 003369 ** 003370 ** EVIDENCE-OF: R-37497-42412 The size of the reserved region is 003371 ** determined by the one-byte unsigned integer found at an offset of 20 003372 ** into the database file header. 
*/ 003373 usableSize = pageSize - page1[20]; 003374 if( (u32)pageSize!=pBt->pageSize ){ 003375 /* After reading the first page of the database assuming a page size 003376 ** of BtShared.pageSize, we have discovered that the page-size is 003377 ** actually pageSize. Unlock the database, leave pBt->pPage1 at 003378 ** zero and return SQLITE_OK. The caller will call this function 003379 ** again with the correct page-size. 003380 */ 003381 releasePageOne(pPage1); 003382 pBt->usableSize = usableSize; 003383 pBt->pageSize = pageSize; 003384 pBt->btsFlags |= BTS_PAGESIZE_FIXED; 003385 freeTempSpace(pBt); 003386 rc = sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize, 003387 pageSize-usableSize); 003388 return rc; 003389 } 003390 if( nPage>nPageFile ){ 003391 if( sqlite3WritableSchema(pBt->db)==0 ){ 003392 rc = SQLITE_CORRUPT_BKPT; 003393 goto page1_init_failed; 003394 }else{ 003395 nPage = nPageFile; 003396 } 003397 } 003398 /* EVIDENCE-OF: R-28312-64704 However, the usable size is not allowed to 003399 ** be less than 480. In other words, if the page size is 512, then the 003400 ** reserved space size cannot exceed 32. */ 003401 if( usableSize<480 ){ 003402 goto page1_init_failed; 003403 } 003404 pBt->btsFlags |= BTS_PAGESIZE_FIXED; 003405 pBt->pageSize = pageSize; 003406 pBt->usableSize = usableSize; 003407 #ifndef SQLITE_OMIT_AUTOVACUUM 003408 pBt->autoVacuum = (get4byte(&page1[36 + 4*4])?1:0); 003409 pBt->incrVacuum = (get4byte(&page1[36 + 7*4])?1:0); 003410 #endif 003411 } 003412 003413 /* maxLocal is the maximum amount of payload to store locally for 003414 ** a cell. Make sure it is small enough so that at least minFanout 003415 ** cells can will fit on one page. We assume a 10-byte page header. 
003416 ** Besides the payload, the cell must store: 003417 ** 2-byte pointer to the cell 003418 ** 4-byte child pointer 003419 ** 9-byte nKey value 003420 ** 4-byte nData value 003421 ** 4-byte overflow page pointer 003422 ** So a cell consists of a 2-byte pointer, a header which is as much as 003423 ** 17 bytes long, 0 to N bytes of payload, and an optional 4 byte overflow 003424 ** page pointer. 003425 */ 003426 pBt->maxLocal = (u16)((pBt->usableSize-12)*64/255 - 23); 003427 pBt->minLocal = (u16)((pBt->usableSize-12)*32/255 - 23); 003428 pBt->maxLeaf = (u16)(pBt->usableSize - 35); 003429 pBt->minLeaf = (u16)((pBt->usableSize-12)*32/255 - 23); 003430 if( pBt->maxLocal>127 ){ 003431 pBt->max1bytePayload = 127; 003432 }else{ 003433 pBt->max1bytePayload = (u8)pBt->maxLocal; 003434 } 003435 assert( pBt->maxLeaf + 23 <= MX_CELL_SIZE(pBt) ); 003436 pBt->pPage1 = pPage1; 003437 pBt->nPage = nPage; 003438 return SQLITE_OK; 003439 003440 page1_init_failed: 003441 releasePageOne(pPage1); 003442 pBt->pPage1 = 0; 003443 return rc; 003444 } 003445 003446 #ifndef NDEBUG 003447 /* 003448 ** Return the number of cursors open on pBt. This is for use 003449 ** in assert() expressions, so it is only compiled if NDEBUG is not 003450 ** defined. 003451 ** 003452 ** Only write cursors are counted if wrOnly is true. If wrOnly is 003453 ** false then all cursors are counted. 003454 ** 003455 ** For the purposes of this routine, a cursor is any cursor that 003456 ** is capable of reading or writing to the database. Cursors that 003457 ** have been tripped into the CURSOR_FAULT state are not counted. 
003458 */ 003459 static int countValidCursors(BtShared *pBt, int wrOnly){ 003460 BtCursor *pCur; 003461 int r = 0; 003462 for(pCur=pBt->pCursor; pCur; pCur=pCur->pNext){ 003463 if( (wrOnly==0 || (pCur->curFlags & BTCF_WriteFlag)!=0) 003464 && pCur->eState!=CURSOR_FAULT ) r++; 003465 } 003466 return r; 003467 } 003468 #endif 003469 003470 /* 003471 ** If there are no outstanding cursors and we are not in the middle 003472 ** of a transaction but there is a read lock on the database, then 003473 ** this routine unrefs the first page of the database file which 003474 ** has the effect of releasing the read lock. 003475 ** 003476 ** If there is a transaction in progress, this routine is a no-op. 003477 */ 003478 static void unlockBtreeIfUnused(BtShared *pBt){ 003479 assert( sqlite3_mutex_held(pBt->mutex) ); 003480 assert( countValidCursors(pBt,0)==0 || pBt->inTransaction>TRANS_NONE ); 003481 if( pBt->inTransaction==TRANS_NONE && pBt->pPage1!=0 ){ 003482 MemPage *pPage1 = pBt->pPage1; 003483 assert( pPage1->aData ); 003484 assert( sqlite3PagerRefcount(pBt->pPager)==1 ); 003485 pBt->pPage1 = 0; 003486 releasePageOne(pPage1); 003487 } 003488 } 003489 003490 /* 003491 ** If pBt points to an empty file then convert that empty file 003492 ** into a new empty database by initializing the first page of 003493 ** the database. 
003494 */ 003495 static int newDatabase(BtShared *pBt){ 003496 MemPage *pP1; 003497 unsigned char *data; 003498 int rc; 003499 003500 assert( sqlite3_mutex_held(pBt->mutex) ); 003501 if( pBt->nPage>0 ){ 003502 return SQLITE_OK; 003503 } 003504 pP1 = pBt->pPage1; 003505 assert( pP1!=0 ); 003506 data = pP1->aData; 003507 rc = sqlite3PagerWrite(pP1->pDbPage); 003508 if( rc ) return rc; 003509 memcpy(data, zMagicHeader, sizeof(zMagicHeader)); 003510 assert( sizeof(zMagicHeader)==16 ); 003511 data[16] = (u8)((pBt->pageSize>>8)&0xff); 003512 data[17] = (u8)((pBt->pageSize>>16)&0xff); 003513 data[18] = 1; 003514 data[19] = 1; 003515 assert( pBt->usableSize<=pBt->pageSize && pBt->usableSize+255>=pBt->pageSize); 003516 data[20] = (u8)(pBt->pageSize - pBt->usableSize); 003517 data[21] = 64; 003518 data[22] = 32; 003519 data[23] = 32; 003520 memset(&data[24], 0, 100-24); 003521 zeroPage(pP1, PTF_INTKEY|PTF_LEAF|PTF_LEAFDATA ); 003522 pBt->btsFlags |= BTS_PAGESIZE_FIXED; 003523 #ifndef SQLITE_OMIT_AUTOVACUUM 003524 assert( pBt->autoVacuum==1 || pBt->autoVacuum==0 ); 003525 assert( pBt->incrVacuum==1 || pBt->incrVacuum==0 ); 003526 put4byte(&data[36 + 4*4], pBt->autoVacuum); 003527 put4byte(&data[36 + 7*4], pBt->incrVacuum); 003528 #endif 003529 pBt->nPage = 1; 003530 data[31] = 1; 003531 return SQLITE_OK; 003532 } 003533 003534 /* 003535 ** Initialize the first page of the database file (creating a database 003536 ** consisting of a single page and no schema objects). Return SQLITE_OK 003537 ** if successful, or an SQLite error code otherwise. 003538 */ 003539 int sqlite3BtreeNewDb(Btree *p){ 003540 int rc; 003541 sqlite3BtreeEnter(p); 003542 p->pBt->nPage = 0; 003543 rc = newDatabase(p->pBt); 003544 sqlite3BtreeLeave(p); 003545 return rc; 003546 } 003547 003548 /* 003549 ** Attempt to start a new transaction. A write-transaction 003550 ** is started if the second argument is nonzero, otherwise a read- 003551 ** transaction. 
** If the second argument is 2 or more, an exclusive
** transaction is started, meaning that no other process is allowed
** to access the database.  A preexisting transaction may not be
** upgraded to exclusive by calling this routine a second time - the
** exclusivity flag only works for a new transaction.
**
** A write-transaction must be started before attempting any
** changes to the database.  None of the following routines
** will work unless a transaction is started first:
**
**      sqlite3BtreeCreateTable()
**      sqlite3BtreeCreateIndex()
**      sqlite3BtreeClearTable()
**      sqlite3BtreeDropTable()
**      sqlite3BtreeInsert()
**      sqlite3BtreeDelete()
**      sqlite3BtreeUpdateMeta()
**
** If an initial attempt to acquire the lock fails because of lock contention
** and the database was previously unlocked, then invoke the busy handler
** if there is one.  But if there was previously a read-lock, do not
** invoke the busy handler - just return SQLITE_BUSY.  SQLITE_BUSY is
** returned when there is already a read-lock in order to avoid a deadlock.
**
** Suppose there are two processes A and B.  A has a read lock and B has
** a reserved lock.  B tries to promote to exclusive but is blocked because
** of A's read lock.  A tries to promote to reserved but is blocked by B.
** One or the other of the two processes must give way or there can be
** no progress.  By returning SQLITE_BUSY and not invoking the busy callback
** when A already has a read lock, we encourage A to give up and let B
** proceed.
003582 */ 003583 static SQLITE_NOINLINE int btreeBeginTrans( 003584 Btree *p, /* The btree in which to start the transaction */ 003585 int wrflag, /* True to start a write transaction */ 003586 int *pSchemaVersion /* Put schema version number here, if not NULL */ 003587 ){ 003588 BtShared *pBt = p->pBt; 003589 Pager *pPager = pBt->pPager; 003590 int rc = SQLITE_OK; 003591 003592 sqlite3BtreeEnter(p); 003593 btreeIntegrity(p); 003594 003595 /* If the btree is already in a write-transaction, or it 003596 ** is already in a read-transaction and a read-transaction 003597 ** is requested, this is a no-op. 003598 */ 003599 if( p->inTrans==TRANS_WRITE || (p->inTrans==TRANS_READ && !wrflag) ){ 003600 goto trans_begun; 003601 } 003602 assert( pBt->inTransaction==TRANS_WRITE || IfNotOmitAV(pBt->bDoTruncate)==0 ); 003603 003604 if( (p->db->flags & SQLITE_ResetDatabase) 003605 && sqlite3PagerIsreadonly(pPager)==0 003606 ){ 003607 pBt->btsFlags &= ~BTS_READ_ONLY; 003608 } 003609 003610 /* Write transactions are not possible on a read-only database */ 003611 if( (pBt->btsFlags & BTS_READ_ONLY)!=0 && wrflag ){ 003612 rc = SQLITE_READONLY; 003613 goto trans_begun; 003614 } 003615 003616 #ifndef SQLITE_OMIT_SHARED_CACHE 003617 { 003618 sqlite3 *pBlock = 0; 003619 /* If another database handle has already opened a write transaction 003620 ** on this shared-btree structure and a second write transaction is 003621 ** requested, return SQLITE_LOCKED. 
003622 */ 003623 if( (wrflag && pBt->inTransaction==TRANS_WRITE) 003624 || (pBt->btsFlags & BTS_PENDING)!=0 003625 ){ 003626 pBlock = pBt->pWriter->db; 003627 }else if( wrflag>1 ){ 003628 BtLock *pIter; 003629 for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){ 003630 if( pIter->pBtree!=p ){ 003631 pBlock = pIter->pBtree->db; 003632 break; 003633 } 003634 } 003635 } 003636 if( pBlock ){ 003637 sqlite3ConnectionBlocked(p->db, pBlock); 003638 rc = SQLITE_LOCKED_SHAREDCACHE; 003639 goto trans_begun; 003640 } 003641 } 003642 #endif 003643 003644 /* Any read-only or read-write transaction implies a read-lock on 003645 ** page 1. So if some other shared-cache client already has a write-lock 003646 ** on page 1, the transaction cannot be opened. */ 003647 rc = querySharedCacheTableLock(p, SCHEMA_ROOT, READ_LOCK); 003648 if( SQLITE_OK!=rc ) goto trans_begun; 003649 003650 pBt->btsFlags &= ~BTS_INITIALLY_EMPTY; 003651 if( pBt->nPage==0 ) pBt->btsFlags |= BTS_INITIALLY_EMPTY; 003652 do { 003653 sqlite3PagerWalDb(pPager, p->db); 003654 003655 #ifdef SQLITE_ENABLE_SETLK_TIMEOUT 003656 /* If transitioning from no transaction directly to a write transaction, 003657 ** block for the WRITER lock first if possible. */ 003658 if( pBt->pPage1==0 && wrflag ){ 003659 assert( pBt->inTransaction==TRANS_NONE ); 003660 rc = sqlite3PagerWalWriteLock(pPager, 1); 003661 if( rc!=SQLITE_BUSY && rc!=SQLITE_OK ) break; 003662 } 003663 #endif 003664 003665 /* Call lockBtree() until either pBt->pPage1 is populated or 003666 ** lockBtree() returns something other than SQLITE_OK. lockBtree() 003667 ** may return SQLITE_OK but leave pBt->pPage1 set to 0 if after 003668 ** reading page 1 it discovers that the page-size of the database 003669 ** file is not pBt->pageSize. In this case lockBtree() will update 003670 ** pBt->pageSize to the page-size of the file on disk. 
003671 */ 003672 while( pBt->pPage1==0 && SQLITE_OK==(rc = lockBtree(pBt)) ); 003673 003674 if( rc==SQLITE_OK && wrflag ){ 003675 if( (pBt->btsFlags & BTS_READ_ONLY)!=0 ){ 003676 rc = SQLITE_READONLY; 003677 }else{ 003678 rc = sqlite3PagerBegin(pPager, wrflag>1, sqlite3TempInMemory(p->db)); 003679 if( rc==SQLITE_OK ){ 003680 rc = newDatabase(pBt); 003681 }else if( rc==SQLITE_BUSY_SNAPSHOT && pBt->inTransaction==TRANS_NONE ){ 003682 /* if there was no transaction opened when this function was 003683 ** called and SQLITE_BUSY_SNAPSHOT is returned, change the error 003684 ** code to SQLITE_BUSY. */ 003685 rc = SQLITE_BUSY; 003686 } 003687 } 003688 } 003689 003690 if( rc!=SQLITE_OK ){ 003691 (void)sqlite3PagerWalWriteLock(pPager, 0); 003692 unlockBtreeIfUnused(pBt); 003693 } 003694 }while( (rc&0xFF)==SQLITE_BUSY && pBt->inTransaction==TRANS_NONE && 003695 btreeInvokeBusyHandler(pBt) ); 003696 sqlite3PagerWalDb(pPager, 0); 003697 #ifdef SQLITE_ENABLE_SETLK_TIMEOUT 003698 if( rc==SQLITE_BUSY_TIMEOUT ) rc = SQLITE_BUSY; 003699 #endif 003700 003701 if( rc==SQLITE_OK ){ 003702 if( p->inTrans==TRANS_NONE ){ 003703 pBt->nTransaction++; 003704 #ifndef SQLITE_OMIT_SHARED_CACHE 003705 if( p->sharable ){ 003706 assert( p->lock.pBtree==p && p->lock.iTable==1 ); 003707 p->lock.eLock = READ_LOCK; 003708 p->lock.pNext = pBt->pLock; 003709 pBt->pLock = &p->lock; 003710 } 003711 #endif 003712 } 003713 p->inTrans = (wrflag?TRANS_WRITE:TRANS_READ); 003714 if( p->inTrans>pBt->inTransaction ){ 003715 pBt->inTransaction = p->inTrans; 003716 } 003717 if( wrflag ){ 003718 MemPage *pPage1 = pBt->pPage1; 003719 #ifndef SQLITE_OMIT_SHARED_CACHE 003720 assert( !pBt->pWriter ); 003721 pBt->pWriter = p; 003722 pBt->btsFlags &= ~BTS_EXCLUSIVE; 003723 if( wrflag>1 ) pBt->btsFlags |= BTS_EXCLUSIVE; 003724 #endif 003725 003726 /* If the db-size header field is incorrect (as it may be if an old 003727 ** client has been writing the database file), update it now. 
Doing 003728 ** this sooner rather than later means the database size can safely 003729 ** re-read the database size from page 1 if a savepoint or transaction 003730 ** rollback occurs within the transaction. 003731 */ 003732 if( pBt->nPage!=get4byte(&pPage1->aData[28]) ){ 003733 rc = sqlite3PagerWrite(pPage1->pDbPage); 003734 if( rc==SQLITE_OK ){ 003735 put4byte(&pPage1->aData[28], pBt->nPage); 003736 } 003737 } 003738 } 003739 } 003740 003741 trans_begun: 003742 if( rc==SQLITE_OK ){ 003743 if( pSchemaVersion ){ 003744 *pSchemaVersion = get4byte(&pBt->pPage1->aData[40]); 003745 } 003746 if( wrflag ){ 003747 /* This call makes sure that the pager has the correct number of 003748 ** open savepoints. If the second parameter is greater than 0 and 003749 ** the sub-journal is not already open, then it will be opened here. 003750 */ 003751 rc = sqlite3PagerOpenSavepoint(pPager, p->db->nSavepoint); 003752 } 003753 } 003754 003755 btreeIntegrity(p); 003756 sqlite3BtreeLeave(p); 003757 return rc; 003758 } 003759 int sqlite3BtreeBeginTrans(Btree *p, int wrflag, int *pSchemaVersion){ 003760 BtShared *pBt; 003761 if( p->sharable 003762 || p->inTrans==TRANS_NONE 003763 || (p->inTrans==TRANS_READ && wrflag!=0) 003764 ){ 003765 return btreeBeginTrans(p,wrflag,pSchemaVersion); 003766 } 003767 pBt = p->pBt; 003768 if( pSchemaVersion ){ 003769 *pSchemaVersion = get4byte(&pBt->pPage1->aData[40]); 003770 } 003771 if( wrflag ){ 003772 /* This call makes sure that the pager has the correct number of 003773 ** open savepoints. If the second parameter is greater than 0 and 003774 ** the sub-journal is not already open, then it will be opened here. 003775 */ 003776 return sqlite3PagerOpenSavepoint(pBt->pPager, p->db->nSavepoint); 003777 }else{ 003778 return SQLITE_OK; 003779 } 003780 } 003781 003782 #ifndef SQLITE_OMIT_AUTOVACUUM 003783 003784 /* 003785 ** Set the pointer-map entries for all children of page pPage. 
Also, if 003786 ** pPage contains cells that point to overflow pages, set the pointer 003787 ** map entries for the overflow pages as well. 003788 */ 003789 static int setChildPtrmaps(MemPage *pPage){ 003790 int i; /* Counter variable */ 003791 int nCell; /* Number of cells in page pPage */ 003792 int rc; /* Return code */ 003793 BtShared *pBt = pPage->pBt; 003794 Pgno pgno = pPage->pgno; 003795 003796 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 003797 rc = pPage->isInit ? SQLITE_OK : btreeInitPage(pPage); 003798 if( rc!=SQLITE_OK ) return rc; 003799 nCell = pPage->nCell; 003800 003801 for(i=0; i<nCell; i++){ 003802 u8 *pCell = findCell(pPage, i); 003803 003804 ptrmapPutOvflPtr(pPage, pPage, pCell, &rc); 003805 003806 if( !pPage->leaf ){ 003807 Pgno childPgno = get4byte(pCell); 003808 ptrmapPut(pBt, childPgno, PTRMAP_BTREE, pgno, &rc); 003809 } 003810 } 003811 003812 if( !pPage->leaf ){ 003813 Pgno childPgno = get4byte(&pPage->aData[pPage->hdrOffset+8]); 003814 ptrmapPut(pBt, childPgno, PTRMAP_BTREE, pgno, &rc); 003815 } 003816 003817 return rc; 003818 } 003819 003820 /* 003821 ** Somewhere on pPage is a pointer to page iFrom. Modify this pointer so 003822 ** that it points to iTo. Parameter eType describes the type of pointer to 003823 ** be modified, as follows: 003824 ** 003825 ** PTRMAP_BTREE: pPage is a btree-page. The pointer points at a child 003826 ** page of pPage. 003827 ** 003828 ** PTRMAP_OVERFLOW1: pPage is a btree-page. The pointer points at an overflow 003829 ** page pointed to by one of the cells on pPage. 003830 ** 003831 ** PTRMAP_OVERFLOW2: pPage is an overflow-page. The pointer points at the next 003832 ** overflow page in the list. 
003833 */ 003834 static int modifyPagePointer(MemPage *pPage, Pgno iFrom, Pgno iTo, u8 eType){ 003835 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 003836 assert( sqlite3PagerIswriteable(pPage->pDbPage) ); 003837 if( eType==PTRMAP_OVERFLOW2 ){ 003838 /* The pointer is always the first 4 bytes of the page in this case. */ 003839 if( get4byte(pPage->aData)!=iFrom ){ 003840 return SQLITE_CORRUPT_PAGE(pPage); 003841 } 003842 put4byte(pPage->aData, iTo); 003843 }else{ 003844 int i; 003845 int nCell; 003846 int rc; 003847 003848 rc = pPage->isInit ? SQLITE_OK : btreeInitPage(pPage); 003849 if( rc ) return rc; 003850 nCell = pPage->nCell; 003851 003852 for(i=0; i<nCell; i++){ 003853 u8 *pCell = findCell(pPage, i); 003854 if( eType==PTRMAP_OVERFLOW1 ){ 003855 CellInfo info; 003856 pPage->xParseCell(pPage, pCell, &info); 003857 if( info.nLocal<info.nPayload ){ 003858 if( pCell+info.nSize > pPage->aData+pPage->pBt->usableSize ){ 003859 return SQLITE_CORRUPT_PAGE(pPage); 003860 } 003861 if( iFrom==get4byte(pCell+info.nSize-4) ){ 003862 put4byte(pCell+info.nSize-4, iTo); 003863 break; 003864 } 003865 } 003866 }else{ 003867 if( pCell+4 > pPage->aData+pPage->pBt->usableSize ){ 003868 return SQLITE_CORRUPT_PAGE(pPage); 003869 } 003870 if( get4byte(pCell)==iFrom ){ 003871 put4byte(pCell, iTo); 003872 break; 003873 } 003874 } 003875 } 003876 003877 if( i==nCell ){ 003878 if( eType!=PTRMAP_BTREE || 003879 get4byte(&pPage->aData[pPage->hdrOffset+8])!=iFrom ){ 003880 return SQLITE_CORRUPT_PAGE(pPage); 003881 } 003882 put4byte(&pPage->aData[pPage->hdrOffset+8], iTo); 003883 } 003884 } 003885 return SQLITE_OK; 003886 } 003887 003888 003889 /* 003890 ** Move the open database page pDbPage to location iFreePage in the 003891 ** database. The pDbPage reference remains valid. 003892 ** 003893 ** The isCommit flag indicates that there is no need to remember that 003894 ** the journal needs to be sync()ed before database page pDbPage->pgno 003895 ** can be written to. 
The caller has already promised not to write to that 003896 ** page. 003897 */ 003898 static int relocatePage( 003899 BtShared *pBt, /* Btree */ 003900 MemPage *pDbPage, /* Open page to move */ 003901 u8 eType, /* Pointer map 'type' entry for pDbPage */ 003902 Pgno iPtrPage, /* Pointer map 'page-no' entry for pDbPage */ 003903 Pgno iFreePage, /* The location to move pDbPage to */ 003904 int isCommit /* isCommit flag passed to sqlite3PagerMovepage */ 003905 ){ 003906 MemPage *pPtrPage; /* The page that contains a pointer to pDbPage */ 003907 Pgno iDbPage = pDbPage->pgno; 003908 Pager *pPager = pBt->pPager; 003909 int rc; 003910 003911 assert( eType==PTRMAP_OVERFLOW2 || eType==PTRMAP_OVERFLOW1 || 003912 eType==PTRMAP_BTREE || eType==PTRMAP_ROOTPAGE ); 003913 assert( sqlite3_mutex_held(pBt->mutex) ); 003914 assert( pDbPage->pBt==pBt ); 003915 if( iDbPage<3 ) return SQLITE_CORRUPT_BKPT; 003916 003917 /* Move page iDbPage from its current location to page number iFreePage */ 003918 TRACE(("AUTOVACUUM: Moving %u to free page %u (ptr page %u type %u)\n", 003919 iDbPage, iFreePage, iPtrPage, eType)); 003920 rc = sqlite3PagerMovepage(pPager, pDbPage->pDbPage, iFreePage, isCommit); 003921 if( rc!=SQLITE_OK ){ 003922 return rc; 003923 } 003924 pDbPage->pgno = iFreePage; 003925 003926 /* If pDbPage was a btree-page, then it may have child pages and/or cells 003927 ** that point to overflow pages. The pointer map entries for all these 003928 ** pages need to be changed. 003929 ** 003930 ** If pDbPage is an overflow page, then the first 4 bytes may store a 003931 ** pointer to a subsequent overflow page. If this is the case, then 003932 ** the pointer map needs to be updated for the subsequent overflow page. 
003933 */ 003934 if( eType==PTRMAP_BTREE || eType==PTRMAP_ROOTPAGE ){ 003935 rc = setChildPtrmaps(pDbPage); 003936 if( rc!=SQLITE_OK ){ 003937 return rc; 003938 } 003939 }else{ 003940 Pgno nextOvfl = get4byte(pDbPage->aData); 003941 if( nextOvfl!=0 ){ 003942 ptrmapPut(pBt, nextOvfl, PTRMAP_OVERFLOW2, iFreePage, &rc); 003943 if( rc!=SQLITE_OK ){ 003944 return rc; 003945 } 003946 } 003947 } 003948 003949 /* Fix the database pointer on page iPtrPage that pointed at iDbPage so 003950 ** that it points at iFreePage. Also fix the pointer map entry for 003951 ** iPtrPage. 003952 */ 003953 if( eType!=PTRMAP_ROOTPAGE ){ 003954 rc = btreeGetPage(pBt, iPtrPage, &pPtrPage, 0); 003955 if( rc!=SQLITE_OK ){ 003956 return rc; 003957 } 003958 rc = sqlite3PagerWrite(pPtrPage->pDbPage); 003959 if( rc!=SQLITE_OK ){ 003960 releasePage(pPtrPage); 003961 return rc; 003962 } 003963 rc = modifyPagePointer(pPtrPage, iDbPage, iFreePage, eType); 003964 releasePage(pPtrPage); 003965 if( rc==SQLITE_OK ){ 003966 ptrmapPut(pBt, iFreePage, eType, iPtrPage, &rc); 003967 } 003968 } 003969 return rc; 003970 } 003971 003972 /* Forward declaration required by incrVacuumStep(). */ 003973 static int allocateBtreePage(BtShared *, MemPage **, Pgno *, Pgno, u8); 003974 003975 /* 003976 ** Perform a single step of an incremental-vacuum. If successful, return 003977 ** SQLITE_OK. If there is no work to do (and therefore no point in 003978 ** calling this function again), return SQLITE_DONE. Or, if an error 003979 ** occurs, return some other error code. 003980 ** 003981 ** More specifically, this function attempts to re-organize the database so 003982 ** that the last page of the file currently in use is no longer in use. 003983 ** 003984 ** Parameter nFin is the number of pages that this database would contain 003985 ** were this function called until it returns SQLITE_DONE. 
003986 ** 003987 ** If the bCommit parameter is non-zero, this function assumes that the 003988 ** caller will keep calling incrVacuumStep() until it returns SQLITE_DONE 003989 ** or an error. bCommit is passed true for an auto-vacuum-on-commit 003990 ** operation, or false for an incremental vacuum. 003991 */ 003992 static int incrVacuumStep(BtShared *pBt, Pgno nFin, Pgno iLastPg, int bCommit){ 003993 Pgno nFreeList; /* Number of pages still on the free-list */ 003994 int rc; 003995 003996 assert( sqlite3_mutex_held(pBt->mutex) ); 003997 assert( iLastPg>nFin ); 003998 003999 if( !PTRMAP_ISPAGE(pBt, iLastPg) && iLastPg!=PENDING_BYTE_PAGE(pBt) ){ 004000 u8 eType; 004001 Pgno iPtrPage; 004002 004003 nFreeList = get4byte(&pBt->pPage1->aData[36]); 004004 if( nFreeList==0 ){ 004005 return SQLITE_DONE; 004006 } 004007 004008 rc = ptrmapGet(pBt, iLastPg, &eType, &iPtrPage); 004009 if( rc!=SQLITE_OK ){ 004010 return rc; 004011 } 004012 if( eType==PTRMAP_ROOTPAGE ){ 004013 return SQLITE_CORRUPT_BKPT; 004014 } 004015 004016 if( eType==PTRMAP_FREEPAGE ){ 004017 if( bCommit==0 ){ 004018 /* Remove the page from the files free-list. This is not required 004019 ** if bCommit is non-zero. In that case, the free-list will be 004020 ** truncated to zero after this function returns, so it doesn't 004021 ** matter if it still contains some garbage entries. 
004022 */ 004023 Pgno iFreePg; 004024 MemPage *pFreePg; 004025 rc = allocateBtreePage(pBt, &pFreePg, &iFreePg, iLastPg, BTALLOC_EXACT); 004026 if( rc!=SQLITE_OK ){ 004027 return rc; 004028 } 004029 assert( iFreePg==iLastPg ); 004030 releasePage(pFreePg); 004031 } 004032 } else { 004033 Pgno iFreePg; /* Index of free page to move pLastPg to */ 004034 MemPage *pLastPg; 004035 u8 eMode = BTALLOC_ANY; /* Mode parameter for allocateBtreePage() */ 004036 Pgno iNear = 0; /* nearby parameter for allocateBtreePage() */ 004037 004038 rc = btreeGetPage(pBt, iLastPg, &pLastPg, 0); 004039 if( rc!=SQLITE_OK ){ 004040 return rc; 004041 } 004042 004043 /* If bCommit is zero, this loop runs exactly once and page pLastPg 004044 ** is swapped with the first free page pulled off the free list. 004045 ** 004046 ** On the other hand, if bCommit is greater than zero, then keep 004047 ** looping until a free-page located within the first nFin pages 004048 ** of the file is found. 004049 */ 004050 if( bCommit==0 ){ 004051 eMode = BTALLOC_LE; 004052 iNear = nFin; 004053 } 004054 do { 004055 MemPage *pFreePg; 004056 Pgno dbSize = btreePagecount(pBt); 004057 rc = allocateBtreePage(pBt, &pFreePg, &iFreePg, iNear, eMode); 004058 if( rc!=SQLITE_OK ){ 004059 releasePage(pLastPg); 004060 return rc; 004061 } 004062 releasePage(pFreePg); 004063 if( iFreePg>dbSize ){ 004064 releasePage(pLastPg); 004065 return SQLITE_CORRUPT_BKPT; 004066 } 004067 }while( bCommit && iFreePg>nFin ); 004068 assert( iFreePg<iLastPg ); 004069 004070 rc = relocatePage(pBt, pLastPg, eType, iPtrPage, iFreePg, bCommit); 004071 releasePage(pLastPg); 004072 if( rc!=SQLITE_OK ){ 004073 return rc; 004074 } 004075 } 004076 } 004077 004078 if( bCommit==0 ){ 004079 do { 004080 iLastPg--; 004081 }while( iLastPg==PENDING_BYTE_PAGE(pBt) || PTRMAP_ISPAGE(pBt, iLastPg) ); 004082 pBt->bDoTruncate = 1; 004083 pBt->nPage = iLastPg; 004084 } 004085 return SQLITE_OK; 004086 } 004087 004088 /* 004089 ** The database opened by the first argument 
is an auto-vacuum database 004090 ** nOrig pages in size containing nFree free pages. Return the expected 004091 ** size of the database in pages following an auto-vacuum operation. 004092 */ 004093 static Pgno finalDbSize(BtShared *pBt, Pgno nOrig, Pgno nFree){ 004094 int nEntry; /* Number of entries on one ptrmap page */ 004095 Pgno nPtrmap; /* Number of PtrMap pages to be freed */ 004096 Pgno nFin; /* Return value */ 004097 004098 nEntry = pBt->usableSize/5; 004099 nPtrmap = (nFree-nOrig+PTRMAP_PAGENO(pBt, nOrig)+nEntry)/nEntry; 004100 nFin = nOrig - nFree - nPtrmap; 004101 if( nOrig>PENDING_BYTE_PAGE(pBt) && nFin<PENDING_BYTE_PAGE(pBt) ){ 004102 nFin--; 004103 } 004104 while( PTRMAP_ISPAGE(pBt, nFin) || nFin==PENDING_BYTE_PAGE(pBt) ){ 004105 nFin--; 004106 } 004107 004108 return nFin; 004109 } 004110 004111 /* 004112 ** A write-transaction must be opened before calling this function. 004113 ** It performs a single unit of work towards an incremental vacuum. 004114 ** 004115 ** If the incremental vacuum is finished after this function has run, 004116 ** SQLITE_DONE is returned. If it is not finished, but no error occurred, 004117 ** SQLITE_OK is returned. Otherwise an SQLite error code. 
004118 */ 004119 int sqlite3BtreeIncrVacuum(Btree *p){ 004120 int rc; 004121 BtShared *pBt = p->pBt; 004122 004123 sqlite3BtreeEnter(p); 004124 assert( pBt->inTransaction==TRANS_WRITE && p->inTrans==TRANS_WRITE ); 004125 if( !pBt->autoVacuum ){ 004126 rc = SQLITE_DONE; 004127 }else{ 004128 Pgno nOrig = btreePagecount(pBt); 004129 Pgno nFree = get4byte(&pBt->pPage1->aData[36]); 004130 Pgno nFin = finalDbSize(pBt, nOrig, nFree); 004131 004132 if( nOrig<nFin || nFree>=nOrig ){ 004133 rc = SQLITE_CORRUPT_BKPT; 004134 }else if( nFree>0 ){ 004135 rc = saveAllCursors(pBt, 0, 0); 004136 if( rc==SQLITE_OK ){ 004137 invalidateAllOverflowCache(pBt); 004138 rc = incrVacuumStep(pBt, nFin, nOrig, 0); 004139 } 004140 if( rc==SQLITE_OK ){ 004141 rc = sqlite3PagerWrite(pBt->pPage1->pDbPage); 004142 put4byte(&pBt->pPage1->aData[28], pBt->nPage); 004143 } 004144 }else{ 004145 rc = SQLITE_DONE; 004146 } 004147 } 004148 sqlite3BtreeLeave(p); 004149 return rc; 004150 } 004151 004152 /* 004153 ** This routine is called prior to sqlite3PagerCommit when a transaction 004154 ** is committed for an auto-vacuum database. 
004155 */ 004156 static int autoVacuumCommit(Btree *p){ 004157 int rc = SQLITE_OK; 004158 Pager *pPager; 004159 BtShared *pBt; 004160 sqlite3 *db; 004161 VVA_ONLY( int nRef ); 004162 004163 assert( p!=0 ); 004164 pBt = p->pBt; 004165 pPager = pBt->pPager; 004166 VVA_ONLY( nRef = sqlite3PagerRefcount(pPager); ) 004167 004168 assert( sqlite3_mutex_held(pBt->mutex) ); 004169 invalidateAllOverflowCache(pBt); 004170 assert(pBt->autoVacuum); 004171 if( !pBt->incrVacuum ){ 004172 Pgno nFin; /* Number of pages in database after autovacuuming */ 004173 Pgno nFree; /* Number of pages on the freelist initially */ 004174 Pgno nVac; /* Number of pages to vacuum */ 004175 Pgno iFree; /* The next page to be freed */ 004176 Pgno nOrig; /* Database size before freeing */ 004177 004178 nOrig = btreePagecount(pBt); 004179 if( PTRMAP_ISPAGE(pBt, nOrig) || nOrig==PENDING_BYTE_PAGE(pBt) ){ 004180 /* It is not possible to create a database for which the final page 004181 ** is either a pointer-map page or the pending-byte page. If one 004182 ** is encountered, this indicates corruption. 
004183 */ 004184 return SQLITE_CORRUPT_BKPT; 004185 } 004186 004187 nFree = get4byte(&pBt->pPage1->aData[36]); 004188 db = p->db; 004189 if( db->xAutovacPages ){ 004190 int iDb; 004191 for(iDb=0; ALWAYS(iDb<db->nDb); iDb++){ 004192 if( db->aDb[iDb].pBt==p ) break; 004193 } 004194 nVac = db->xAutovacPages( 004195 db->pAutovacPagesArg, 004196 db->aDb[iDb].zDbSName, 004197 nOrig, 004198 nFree, 004199 pBt->pageSize 004200 ); 004201 if( nVac>nFree ){ 004202 nVac = nFree; 004203 } 004204 if( nVac==0 ){ 004205 return SQLITE_OK; 004206 } 004207 }else{ 004208 nVac = nFree; 004209 } 004210 nFin = finalDbSize(pBt, nOrig, nVac); 004211 if( nFin>nOrig ) return SQLITE_CORRUPT_BKPT; 004212 if( nFin<nOrig ){ 004213 rc = saveAllCursors(pBt, 0, 0); 004214 } 004215 for(iFree=nOrig; iFree>nFin && rc==SQLITE_OK; iFree--){ 004216 rc = incrVacuumStep(pBt, nFin, iFree, nVac==nFree); 004217 } 004218 if( (rc==SQLITE_DONE || rc==SQLITE_OK) && nFree>0 ){ 004219 rc = sqlite3PagerWrite(pBt->pPage1->pDbPage); 004220 if( nVac==nFree ){ 004221 put4byte(&pBt->pPage1->aData[32], 0); 004222 put4byte(&pBt->pPage1->aData[36], 0); 004223 } 004224 put4byte(&pBt->pPage1->aData[28], nFin); 004225 pBt->bDoTruncate = 1; 004226 pBt->nPage = nFin; 004227 } 004228 if( rc!=SQLITE_OK ){ 004229 sqlite3PagerRollback(pPager); 004230 } 004231 } 004232 004233 assert( nRef>=sqlite3PagerRefcount(pPager) ); 004234 return rc; 004235 } 004236 004237 #else /* ifndef SQLITE_OMIT_AUTOVACUUM */ 004238 # define setChildPtrmaps(x) SQLITE_OK 004239 #endif 004240 004241 /* 004242 ** This routine does the first phase of a two-phase commit. This routine 004243 ** causes a rollback journal to be created (if it does not already exist) 004244 ** and populated with enough information so that if a power loss occurs 004245 ** the database can be restored to its original state by playing back 004246 ** the journal. Then the contents of the journal are flushed out to 004247 ** the disk. 
After the journal is safely on oxide, the changes to the 004248 ** database are written into the database file and flushed to oxide. 004249 ** At the end of this call, the rollback journal still exists on the 004250 ** disk and we are still holding all locks, so the transaction has not 004251 ** committed. See sqlite3BtreeCommitPhaseTwo() for the second phase of the 004252 ** commit process. 004253 ** 004254 ** This call is a no-op if no write-transaction is currently active on pBt. 004255 ** 004256 ** Otherwise, sync the database file for the btree pBt. zSuperJrnl points to 004257 ** the name of a super-journal file that should be written into the 004258 ** individual journal file, or is NULL, indicating no super-journal file 004259 ** (single database transaction). 004260 ** 004261 ** When this is called, the super-journal should already have been 004262 ** created, populated with this journal pointer and synced to disk. 004263 ** 004264 ** Once this is routine has returned, the only thing required to commit 004265 ** the write-transaction for this database file is to delete the journal. 004266 */ 004267 int sqlite3BtreeCommitPhaseOne(Btree *p, const char *zSuperJrnl){ 004268 int rc = SQLITE_OK; 004269 if( p->inTrans==TRANS_WRITE ){ 004270 BtShared *pBt = p->pBt; 004271 sqlite3BtreeEnter(p); 004272 #ifndef SQLITE_OMIT_AUTOVACUUM 004273 if( pBt->autoVacuum ){ 004274 rc = autoVacuumCommit(p); 004275 if( rc!=SQLITE_OK ){ 004276 sqlite3BtreeLeave(p); 004277 return rc; 004278 } 004279 } 004280 if( pBt->bDoTruncate ){ 004281 sqlite3PagerTruncateImage(pBt->pPager, pBt->nPage); 004282 } 004283 #endif 004284 rc = sqlite3PagerCommitPhaseOne(pBt->pPager, zSuperJrnl, 0); 004285 sqlite3BtreeLeave(p); 004286 } 004287 return rc; 004288 } 004289 004290 /* 004291 ** This function is called from both BtreeCommitPhaseTwo() and BtreeRollback() 004292 ** at the conclusion of a transaction. 
004293 */ 004294 static void btreeEndTransaction(Btree *p){ 004295 BtShared *pBt = p->pBt; 004296 sqlite3 *db = p->db; 004297 assert( sqlite3BtreeHoldsMutex(p) ); 004298 004299 #ifndef SQLITE_OMIT_AUTOVACUUM 004300 pBt->bDoTruncate = 0; 004301 #endif 004302 if( p->inTrans>TRANS_NONE && db->nVdbeRead>1 ){ 004303 /* If there are other active statements that belong to this database 004304 ** handle, downgrade to a read-only transaction. The other statements 004305 ** may still be reading from the database. */ 004306 downgradeAllSharedCacheTableLocks(p); 004307 p->inTrans = TRANS_READ; 004308 }else{ 004309 /* If the handle had any kind of transaction open, decrement the 004310 ** transaction count of the shared btree. If the transaction count 004311 ** reaches 0, set the shared state to TRANS_NONE. The unlockBtreeIfUnused() 004312 ** call below will unlock the pager. */ 004313 if( p->inTrans!=TRANS_NONE ){ 004314 clearAllSharedCacheTableLocks(p); 004315 pBt->nTransaction--; 004316 if( 0==pBt->nTransaction ){ 004317 pBt->inTransaction = TRANS_NONE; 004318 } 004319 } 004320 004321 /* Set the current transaction state to TRANS_NONE and unlock the 004322 ** pager if this call closed the only read or write transaction. */ 004323 p->inTrans = TRANS_NONE; 004324 unlockBtreeIfUnused(pBt); 004325 } 004326 004327 btreeIntegrity(p); 004328 } 004329 004330 /* 004331 ** Commit the transaction currently in progress. 004332 ** 004333 ** This routine implements the second phase of a 2-phase commit. The 004334 ** sqlite3BtreeCommitPhaseOne() routine does the first phase and should 004335 ** be invoked prior to calling this routine. The sqlite3BtreeCommitPhaseOne() 004336 ** routine did all the work of writing information out to disk and flushing the 004337 ** contents so that they are written onto the disk platter. 
All this 004338 ** routine has to do is delete or truncate or zero the header in the 004339 ** the rollback journal (which causes the transaction to commit) and 004340 ** drop locks. 004341 ** 004342 ** Normally, if an error occurs while the pager layer is attempting to 004343 ** finalize the underlying journal file, this function returns an error and 004344 ** the upper layer will attempt a rollback. However, if the second argument 004345 ** is non-zero then this b-tree transaction is part of a multi-file 004346 ** transaction. In this case, the transaction has already been committed 004347 ** (by deleting a super-journal file) and the caller will ignore this 004348 ** functions return code. So, even if an error occurs in the pager layer, 004349 ** reset the b-tree objects internal state to indicate that the write 004350 ** transaction has been closed. This is quite safe, as the pager will have 004351 ** transitioned to the error state. 004352 ** 004353 ** This will release the write lock on the database file. If there 004354 ** are no active cursors, it also releases the read lock. 004355 */ 004356 int sqlite3BtreeCommitPhaseTwo(Btree *p, int bCleanup){ 004357 004358 if( p->inTrans==TRANS_NONE ) return SQLITE_OK; 004359 sqlite3BtreeEnter(p); 004360 btreeIntegrity(p); 004361 004362 /* If the handle has a write-transaction open, commit the shared-btrees 004363 ** transaction and set the shared state to TRANS_READ. 
004364 */ 004365 if( p->inTrans==TRANS_WRITE ){ 004366 int rc; 004367 BtShared *pBt = p->pBt; 004368 assert( pBt->inTransaction==TRANS_WRITE ); 004369 assert( pBt->nTransaction>0 ); 004370 rc = sqlite3PagerCommitPhaseTwo(pBt->pPager); 004371 if( rc!=SQLITE_OK && bCleanup==0 ){ 004372 sqlite3BtreeLeave(p); 004373 return rc; 004374 } 004375 p->iBDataVersion--; /* Compensate for pPager->iDataVersion++; */ 004376 pBt->inTransaction = TRANS_READ; 004377 btreeClearHasContent(pBt); 004378 } 004379 004380 btreeEndTransaction(p); 004381 sqlite3BtreeLeave(p); 004382 return SQLITE_OK; 004383 } 004384 004385 /* 004386 ** Do both phases of a commit. 004387 */ 004388 int sqlite3BtreeCommit(Btree *p){ 004389 int rc; 004390 sqlite3BtreeEnter(p); 004391 rc = sqlite3BtreeCommitPhaseOne(p, 0); 004392 if( rc==SQLITE_OK ){ 004393 rc = sqlite3BtreeCommitPhaseTwo(p, 0); 004394 } 004395 sqlite3BtreeLeave(p); 004396 return rc; 004397 } 004398 004399 /* 004400 ** This routine sets the state to CURSOR_FAULT and the error 004401 ** code to errCode for every cursor on any BtShared that pBtree 004402 ** references. Or if the writeOnly flag is set to 1, then only 004403 ** trip write cursors and leave read cursors unchanged. 004404 ** 004405 ** Every cursor is a candidate to be tripped, including cursors 004406 ** that belong to other database connections that happen to be 004407 ** sharing the cache with pBtree. 004408 ** 004409 ** This routine gets called when a rollback occurs. If the writeOnly 004410 ** flag is true, then only write-cursors need be tripped - read-only 004411 ** cursors save their current positions so that they may continue 004412 ** following the rollback. Or, if writeOnly is false, all cursors are 004413 ** tripped. In general, writeOnly is false if the transaction being 004414 ** rolled back modified the database schema. In this case b-tree root 004415 ** pages may be moved or deleted from the database altogether, making 004416 ** it unsafe for read cursors to continue. 
004417 ** 004418 ** If the writeOnly flag is true and an error is encountered while 004419 ** saving the current position of a read-only cursor, all cursors, 004420 ** including all read-cursors are tripped. 004421 ** 004422 ** SQLITE_OK is returned if successful, or if an error occurs while 004423 ** saving a cursor position, an SQLite error code. 004424 */ 004425 int sqlite3BtreeTripAllCursors(Btree *pBtree, int errCode, int writeOnly){ 004426 BtCursor *p; 004427 int rc = SQLITE_OK; 004428 004429 assert( (writeOnly==0 || writeOnly==1) && BTCF_WriteFlag==1 ); 004430 if( pBtree ){ 004431 sqlite3BtreeEnter(pBtree); 004432 for(p=pBtree->pBt->pCursor; p; p=p->pNext){ 004433 if( writeOnly && (p->curFlags & BTCF_WriteFlag)==0 ){ 004434 if( p->eState==CURSOR_VALID || p->eState==CURSOR_SKIPNEXT ){ 004435 rc = saveCursorPosition(p); 004436 if( rc!=SQLITE_OK ){ 004437 (void)sqlite3BtreeTripAllCursors(pBtree, rc, 0); 004438 break; 004439 } 004440 } 004441 }else{ 004442 sqlite3BtreeClearCursor(p); 004443 p->eState = CURSOR_FAULT; 004444 p->skipNext = errCode; 004445 } 004446 btreeReleaseAllCursorPages(p); 004447 } 004448 sqlite3BtreeLeave(pBtree); 004449 } 004450 return rc; 004451 } 004452 004453 /* 004454 ** Set the pBt->nPage field correctly, according to the current 004455 ** state of the database. Assume pBt->pPage1 is valid. 004456 */ 004457 static void btreeSetNPage(BtShared *pBt, MemPage *pPage1){ 004458 int nPage = get4byte(&pPage1->aData[28]); 004459 testcase( nPage==0 ); 004460 if( nPage==0 ) sqlite3PagerPagecount(pBt->pPager, &nPage); 004461 testcase( pBt->nPage!=(u32)nPage ); 004462 pBt->nPage = nPage; 004463 } 004464 004465 /* 004466 ** Rollback the transaction in progress. 004467 ** 004468 ** If tripCode is not SQLITE_OK then cursors will be invalidated (tripped). 004469 ** Only write cursors are tripped if writeOnly is true but all cursors are 004470 ** tripped if writeOnly is false. Any attempt to use 004471 ** a tripped cursor will result in an error. 
004472 ** 004473 ** This will release the write lock on the database file. If there 004474 ** are no active cursors, it also releases the read lock. 004475 */ 004476 int sqlite3BtreeRollback(Btree *p, int tripCode, int writeOnly){ 004477 int rc; 004478 BtShared *pBt = p->pBt; 004479 MemPage *pPage1; 004480 004481 assert( writeOnly==1 || writeOnly==0 ); 004482 assert( tripCode==SQLITE_ABORT_ROLLBACK || tripCode==SQLITE_OK ); 004483 sqlite3BtreeEnter(p); 004484 if( tripCode==SQLITE_OK ){ 004485 rc = tripCode = saveAllCursors(pBt, 0, 0); 004486 if( rc ) writeOnly = 0; 004487 }else{ 004488 rc = SQLITE_OK; 004489 } 004490 if( tripCode ){ 004491 int rc2 = sqlite3BtreeTripAllCursors(p, tripCode, writeOnly); 004492 assert( rc==SQLITE_OK || (writeOnly==0 && rc2==SQLITE_OK) ); 004493 if( rc2!=SQLITE_OK ) rc = rc2; 004494 } 004495 btreeIntegrity(p); 004496 004497 if( p->inTrans==TRANS_WRITE ){ 004498 int rc2; 004499 004500 assert( TRANS_WRITE==pBt->inTransaction ); 004501 rc2 = sqlite3PagerRollback(pBt->pPager); 004502 if( rc2!=SQLITE_OK ){ 004503 rc = rc2; 004504 } 004505 004506 /* The rollback may have destroyed the pPage1->aData value. So 004507 ** call btreeGetPage() on page 1 again to make 004508 ** sure pPage1->aData is set correctly. */ 004509 if( btreeGetPage(pBt, 1, &pPage1, 0)==SQLITE_OK ){ 004510 btreeSetNPage(pBt, pPage1); 004511 releasePageOne(pPage1); 004512 } 004513 assert( countValidCursors(pBt, 1)==0 ); 004514 pBt->inTransaction = TRANS_READ; 004515 btreeClearHasContent(pBt); 004516 } 004517 004518 btreeEndTransaction(p); 004519 sqlite3BtreeLeave(p); 004520 return rc; 004521 } 004522 004523 /* 004524 ** Start a statement subtransaction. The subtransaction can be rolled 004525 ** back independently of the main transaction. You must start a transaction 004526 ** before starting a subtransaction. The subtransaction is ended automatically 004527 ** if the main transaction commits or rolls back. 
004528 ** 004529 ** Statement subtransactions are used around individual SQL statements 004530 ** that are contained within a BEGIN...COMMIT block. If a constraint 004531 ** error occurs within the statement, the effect of that one statement 004532 ** can be rolled back without having to rollback the entire transaction. 004533 ** 004534 ** A statement sub-transaction is implemented as an anonymous savepoint. The 004535 ** value passed as the second parameter is the total number of savepoints, 004536 ** including the new anonymous savepoint, open on the B-Tree. i.e. if there 004537 ** are no active savepoints and no other statement-transactions open, 004538 ** iStatement is 1. This anonymous savepoint can be released or rolled back 004539 ** using the sqlite3BtreeSavepoint() function. 004540 */ 004541 int sqlite3BtreeBeginStmt(Btree *p, int iStatement){ 004542 int rc; 004543 BtShared *pBt = p->pBt; 004544 sqlite3BtreeEnter(p); 004545 assert( p->inTrans==TRANS_WRITE ); 004546 assert( (pBt->btsFlags & BTS_READ_ONLY)==0 ); 004547 assert( iStatement>0 ); 004548 assert( iStatement>p->db->nSavepoint ); 004549 assert( pBt->inTransaction==TRANS_WRITE ); 004550 /* At the pager level, a statement transaction is a savepoint with 004551 ** an index greater than all savepoints created explicitly using 004552 ** SQL statements. It is illegal to open, release or rollback any 004553 ** such savepoints while the statement transaction savepoint is active. 004554 */ 004555 rc = sqlite3PagerOpenSavepoint(pBt->pPager, iStatement); 004556 sqlite3BtreeLeave(p); 004557 return rc; 004558 } 004559 004560 /* 004561 ** The second argument to this function, op, is always SAVEPOINT_ROLLBACK 004562 ** or SAVEPOINT_RELEASE. This function either releases or rolls back the 004563 ** savepoint identified by parameter iSavepoint, depending on the value 004564 ** of op. 004565 ** 004566 ** Normally, iSavepoint is greater than or equal to zero. 
However, if op is 004567 ** SAVEPOINT_ROLLBACK, then iSavepoint may also be -1. In this case the 004568 ** contents of the entire transaction are rolled back. This is different 004569 ** from a normal transaction rollback, as no locks are released and the 004570 ** transaction remains open. 004571 */ 004572 int sqlite3BtreeSavepoint(Btree *p, int op, int iSavepoint){ 004573 int rc = SQLITE_OK; 004574 if( p && p->inTrans==TRANS_WRITE ){ 004575 BtShared *pBt = p->pBt; 004576 assert( op==SAVEPOINT_RELEASE || op==SAVEPOINT_ROLLBACK ); 004577 assert( iSavepoint>=0 || (iSavepoint==-1 && op==SAVEPOINT_ROLLBACK) ); 004578 sqlite3BtreeEnter(p); 004579 if( op==SAVEPOINT_ROLLBACK ){ 004580 rc = saveAllCursors(pBt, 0, 0); 004581 } 004582 if( rc==SQLITE_OK ){ 004583 rc = sqlite3PagerSavepoint(pBt->pPager, op, iSavepoint); 004584 } 004585 if( rc==SQLITE_OK ){ 004586 if( iSavepoint<0 && (pBt->btsFlags & BTS_INITIALLY_EMPTY)!=0 ){ 004587 pBt->nPage = 0; 004588 } 004589 rc = newDatabase(pBt); 004590 btreeSetNPage(pBt, pBt->pPage1); 004591 004592 /* pBt->nPage might be zero if the database was corrupt when 004593 ** the transaction was started. Otherwise, it must be at least 1. */ 004594 assert( CORRUPT_DB || pBt->nPage>0 ); 004595 } 004596 sqlite3BtreeLeave(p); 004597 } 004598 return rc; 004599 } 004600 004601 /* 004602 ** Create a new cursor for the BTree whose root is on the page 004603 ** iTable. If a read-only cursor is requested, it is assumed that 004604 ** the caller already has at least a read-only transaction open 004605 ** on the database already. If a write-cursor is requested, then 004606 ** the caller is assumed to have an open write transaction. 004607 ** 004608 ** If the BTREE_WRCSR bit of wrFlag is clear, then the cursor can only 004609 ** be used for reading. If the BTREE_WRCSR bit is set, then the cursor 004610 ** can be used for reading or for writing if other conditions for writing 004611 ** are also met. 
These are the conditions that must be met in order 004612 ** for writing to be allowed: 004613 ** 004614 ** 1: The cursor must have been opened with wrFlag containing BTREE_WRCSR 004615 ** 004616 ** 2: Other database connections that share the same pager cache 004617 ** but which are not in the READ_UNCOMMITTED state may not have 004618 ** cursors open with wrFlag==0 on the same table. Otherwise 004619 ** the changes made by this write cursor would be visible to 004620 ** the read cursors in the other database connection. 004621 ** 004622 ** 3: The database must be writable (not on read-only media) 004623 ** 004624 ** 4: There must be an active transaction. 004625 ** 004626 ** The BTREE_FORDELETE bit of wrFlag may optionally be set if BTREE_WRCSR 004627 ** is set. If FORDELETE is set, that is a hint to the implementation that 004628 ** this cursor will only be used to seek to and delete entries of an index 004629 ** as part of a larger DELETE statement. The FORDELETE hint is not used by 004630 ** this implementation. But in a hypothetical alternative storage engine 004631 ** in which index entries are automatically deleted when corresponding table 004632 ** rows are deleted, the FORDELETE flag is a hint that all SEEK and DELETE 004633 ** operations on this cursor can be no-ops and all READ operations can 004634 ** return a null row (2-bytes: 0x01 0x00). 004635 ** 004636 ** No checking is done to make sure that page iTable really is the 004637 ** root page of a b-tree. If it is not, then the cursor acquired 004638 ** will not work correctly. 004639 ** 004640 ** It is assumed that the sqlite3BtreeCursorZero() has been called 004641 ** on pCur to initialize the memory space prior to invoking this routine. 004642 */ 004643 static int btreeCursor( 004644 Btree *p, /* The btree */ 004645 Pgno iTable, /* Root page of table to open */ 004646 int wrFlag, /* 1 to write. 
0 read-only */ 004647 struct KeyInfo *pKeyInfo, /* First arg to comparison function */ 004648 BtCursor *pCur /* Space for new cursor */ 004649 ){ 004650 BtShared *pBt = p->pBt; /* Shared b-tree handle */ 004651 BtCursor *pX; /* Looping over other all cursors */ 004652 004653 assert( sqlite3BtreeHoldsMutex(p) ); 004654 assert( wrFlag==0 004655 || wrFlag==BTREE_WRCSR 004656 || wrFlag==(BTREE_WRCSR|BTREE_FORDELETE) 004657 ); 004658 004659 /* The following assert statements verify that if this is a sharable 004660 ** b-tree database, the connection is holding the required table locks, 004661 ** and that no other connection has any open cursor that conflicts with 004662 ** this lock. The iTable<1 term disables the check for corrupt schemas. */ 004663 assert( hasSharedCacheTableLock(p, iTable, pKeyInfo!=0, (wrFlag?2:1)) 004664 || iTable<1 ); 004665 assert( wrFlag==0 || !hasReadConflicts(p, iTable) ); 004666 004667 /* Assert that the caller has opened the required transaction. */ 004668 assert( p->inTrans>TRANS_NONE ); 004669 assert( wrFlag==0 || p->inTrans==TRANS_WRITE ); 004670 assert( pBt->pPage1 && pBt->pPage1->aData ); 004671 assert( wrFlag==0 || (pBt->btsFlags & BTS_READ_ONLY)==0 ); 004672 004673 if( iTable<=1 ){ 004674 if( iTable<1 ){ 004675 return SQLITE_CORRUPT_BKPT; 004676 }else if( btreePagecount(pBt)==0 ){ 004677 assert( wrFlag==0 ); 004678 iTable = 0; 004679 } 004680 } 004681 004682 /* Now that no other errors can occur, finish filling in the BtCursor 004683 ** variables and link the cursor into the BtShared list. */ 004684 pCur->pgnoRoot = iTable; 004685 pCur->iPage = -1; 004686 pCur->pKeyInfo = pKeyInfo; 004687 pCur->pBtree = p; 004688 pCur->pBt = pBt; 004689 pCur->curFlags = 0; 004690 /* If there are two or more cursors on the same btree, then all such 004691 ** cursors *must* have the BTCF_Multiple flag set. 
*/ 004692 for(pX=pBt->pCursor; pX; pX=pX->pNext){ 004693 if( pX->pgnoRoot==iTable ){ 004694 pX->curFlags |= BTCF_Multiple; 004695 pCur->curFlags = BTCF_Multiple; 004696 } 004697 } 004698 pCur->eState = CURSOR_INVALID; 004699 pCur->pNext = pBt->pCursor; 004700 pBt->pCursor = pCur; 004701 if( wrFlag ){ 004702 pCur->curFlags |= BTCF_WriteFlag; 004703 pCur->curPagerFlags = 0; 004704 if( pBt->pTmpSpace==0 ) return allocateTempSpace(pBt); 004705 }else{ 004706 pCur->curPagerFlags = PAGER_GET_READONLY; 004707 } 004708 return SQLITE_OK; 004709 } 004710 static int btreeCursorWithLock( 004711 Btree *p, /* The btree */ 004712 Pgno iTable, /* Root page of table to open */ 004713 int wrFlag, /* 1 to write. 0 read-only */ 004714 struct KeyInfo *pKeyInfo, /* First arg to comparison function */ 004715 BtCursor *pCur /* Space for new cursor */ 004716 ){ 004717 int rc; 004718 sqlite3BtreeEnter(p); 004719 rc = btreeCursor(p, iTable, wrFlag, pKeyInfo, pCur); 004720 sqlite3BtreeLeave(p); 004721 return rc; 004722 } 004723 int sqlite3BtreeCursor( 004724 Btree *p, /* The btree */ 004725 Pgno iTable, /* Root page of table to open */ 004726 int wrFlag, /* 1 to write. 0 read-only */ 004727 struct KeyInfo *pKeyInfo, /* First arg to xCompare() */ 004728 BtCursor *pCur /* Write new cursor here */ 004729 ){ 004730 if( p->sharable ){ 004731 return btreeCursorWithLock(p, iTable, wrFlag, pKeyInfo, pCur); 004732 }else{ 004733 return btreeCursor(p, iTable, wrFlag, pKeyInfo, pCur); 004734 } 004735 } 004736 004737 /* 004738 ** Return the size of a BtCursor object in bytes. 004739 ** 004740 ** This interfaces is needed so that users of cursors can preallocate 004741 ** sufficient storage to hold a cursor. The BtCursor object is opaque 004742 ** to users so they cannot do the sizeof() themselves - they must call 004743 ** this routine. 
004744 */ 004745 int sqlite3BtreeCursorSize(void){ 004746 return ROUND8(sizeof(BtCursor)); 004747 } 004748 004749 /* 004750 ** Initialize memory that will be converted into a BtCursor object. 004751 ** 004752 ** The simple approach here would be to memset() the entire object 004753 ** to zero. But it turns out that the apPage[] and aiIdx[] arrays 004754 ** do not need to be zeroed and they are large, so we can save a lot 004755 ** of run-time by skipping the initialization of those elements. 004756 */ 004757 void sqlite3BtreeCursorZero(BtCursor *p){ 004758 memset(p, 0, offsetof(BtCursor, BTCURSOR_FIRST_UNINIT)); 004759 } 004760 004761 /* 004762 ** Close a cursor. The read lock on the database file is released 004763 ** when the last cursor is closed. 004764 */ 004765 int sqlite3BtreeCloseCursor(BtCursor *pCur){ 004766 Btree *pBtree = pCur->pBtree; 004767 if( pBtree ){ 004768 BtShared *pBt = pCur->pBt; 004769 sqlite3BtreeEnter(pBtree); 004770 assert( pBt->pCursor!=0 ); 004771 if( pBt->pCursor==pCur ){ 004772 pBt->pCursor = pCur->pNext; 004773 }else{ 004774 BtCursor *pPrev = pBt->pCursor; 004775 do{ 004776 if( pPrev->pNext==pCur ){ 004777 pPrev->pNext = pCur->pNext; 004778 break; 004779 } 004780 pPrev = pPrev->pNext; 004781 }while( ALWAYS(pPrev) ); 004782 } 004783 btreeReleaseAllCursorPages(pCur); 004784 unlockBtreeIfUnused(pBt); 004785 sqlite3_free(pCur->aOverflow); 004786 sqlite3_free(pCur->pKey); 004787 if( (pBt->openFlags & BTREE_SINGLE) && pBt->pCursor==0 ){ 004788 /* Since the BtShared is not sharable, there is no need to 004789 ** worry about the missing sqlite3BtreeLeave() call here. */ 004790 assert( pBtree->sharable==0 ); 004791 sqlite3BtreeClose(pBtree); 004792 }else{ 004793 sqlite3BtreeLeave(pBtree); 004794 } 004795 pCur->pBtree = 0; 004796 } 004797 return SQLITE_OK; 004798 } 004799 004800 /* 004801 ** Make sure the BtCursor* given in the argument has a valid 004802 ** BtCursor.info structure. 
If it is not already valid, call 004803 ** btreeParseCell() to fill it in. 004804 ** 004805 ** BtCursor.info is a cache of the information in the current cell. 004806 ** Using this cache reduces the number of calls to btreeParseCell(). 004807 */ 004808 #ifndef NDEBUG 004809 static int cellInfoEqual(CellInfo *a, CellInfo *b){ 004810 if( a->nKey!=b->nKey ) return 0; 004811 if( a->pPayload!=b->pPayload ) return 0; 004812 if( a->nPayload!=b->nPayload ) return 0; 004813 if( a->nLocal!=b->nLocal ) return 0; 004814 if( a->nSize!=b->nSize ) return 0; 004815 return 1; 004816 } 004817 static void assertCellInfo(BtCursor *pCur){ 004818 CellInfo info; 004819 memset(&info, 0, sizeof(info)); 004820 btreeParseCell(pCur->pPage, pCur->ix, &info); 004821 assert( CORRUPT_DB || cellInfoEqual(&info, &pCur->info) ); 004822 } 004823 #else 004824 #define assertCellInfo(x) 004825 #endif 004826 static SQLITE_NOINLINE void getCellInfo(BtCursor *pCur){ 004827 if( pCur->info.nSize==0 ){ 004828 pCur->curFlags |= BTCF_ValidNKey; 004829 btreeParseCell(pCur->pPage,pCur->ix,&pCur->info); 004830 }else{ 004831 assertCellInfo(pCur); 004832 } 004833 } 004834 004835 #ifndef NDEBUG /* The next routine used only within assert() statements */ 004836 /* 004837 ** Return true if the given BtCursor is valid. A valid cursor is one 004838 ** that is currently pointing to a row in a (non-empty) table. 004839 ** This is a verification routine is used only within assert() statements. 004840 */ 004841 int sqlite3BtreeCursorIsValid(BtCursor *pCur){ 004842 return pCur && pCur->eState==CURSOR_VALID; 004843 } 004844 #endif /* NDEBUG */ 004845 int sqlite3BtreeCursorIsValidNN(BtCursor *pCur){ 004846 assert( pCur!=0 ); 004847 return pCur->eState==CURSOR_VALID; 004848 } 004849 004850 /* 004851 ** Return the value of the integer key or "rowid" for a table btree. 004852 ** This routine is only valid for a cursor that is pointing into a 004853 ** ordinary table btree. 
If the cursor points to an index btree or 004854 ** is invalid, the result of this routine is undefined. 004855 */ 004856 i64 sqlite3BtreeIntegerKey(BtCursor *pCur){ 004857 assert( cursorHoldsMutex(pCur) ); 004858 assert( pCur->eState==CURSOR_VALID ); 004859 assert( pCur->curIntKey ); 004860 getCellInfo(pCur); 004861 return pCur->info.nKey; 004862 } 004863 004864 /* 004865 ** Pin or unpin a cursor. 004866 */ 004867 void sqlite3BtreeCursorPin(BtCursor *pCur){ 004868 assert( (pCur->curFlags & BTCF_Pinned)==0 ); 004869 pCur->curFlags |= BTCF_Pinned; 004870 } 004871 void sqlite3BtreeCursorUnpin(BtCursor *pCur){ 004872 assert( (pCur->curFlags & BTCF_Pinned)!=0 ); 004873 pCur->curFlags &= ~BTCF_Pinned; 004874 } 004875 004876 /* 004877 ** Return the offset into the database file for the start of the 004878 ** payload to which the cursor is pointing. 004879 */ 004880 i64 sqlite3BtreeOffset(BtCursor *pCur){ 004881 assert( cursorHoldsMutex(pCur) ); 004882 assert( pCur->eState==CURSOR_VALID ); 004883 getCellInfo(pCur); 004884 return (i64)pCur->pBt->pageSize*((i64)pCur->pPage->pgno - 1) + 004885 (i64)(pCur->info.pPayload - pCur->pPage->aData); 004886 } 004887 004888 /* 004889 ** Return the number of bytes of payload for the entry that pCur is 004890 ** currently pointing to. For table btrees, this will be the amount 004891 ** of data. For index btrees, this will be the size of the key. 004892 ** 004893 ** The caller must guarantee that the cursor is pointing to a non-NULL 004894 ** valid entry. In other words, the calling procedure must guarantee 004895 ** that the cursor has Cursor.eState==CURSOR_VALID. 004896 */ 004897 u32 sqlite3BtreePayloadSize(BtCursor *pCur){ 004898 assert( cursorHoldsMutex(pCur) ); 004899 assert( pCur->eState==CURSOR_VALID ); 004900 getCellInfo(pCur); 004901 return pCur->info.nPayload; 004902 } 004903 004904 /* 004905 ** Return an upper bound on the size of any record for the table 004906 ** that the cursor is pointing into. 
004907 ** 004908 ** This is an optimization. Everything will still work if this 004909 ** routine always returns 2147483647 (which is the largest record 004910 ** that SQLite can handle) or more. But returning a smaller value might 004911 ** prevent large memory allocations when trying to interpret a 004912 ** corrupt database. 004913 ** 004914 ** The current implementation merely returns the size of the underlying 004915 ** database file. 004916 */ 004917 sqlite3_int64 sqlite3BtreeMaxRecordSize(BtCursor *pCur){ 004918 assert( cursorHoldsMutex(pCur) ); 004919 assert( pCur->eState==CURSOR_VALID ); 004920 return pCur->pBt->pageSize * (sqlite3_int64)pCur->pBt->nPage; 004921 } 004922 004923 /* 004924 ** Given the page number of an overflow page in the database (parameter 004925 ** ovfl), this function finds the page number of the next page in the 004926 ** linked list of overflow pages. If possible, it uses the auto-vacuum 004927 ** pointer-map data instead of reading the content of page ovfl to do so. 004928 ** 004929 ** If an error occurs an SQLite error code is returned. Otherwise: 004930 ** 004931 ** The page number of the next overflow page in the linked list is 004932 ** written to *pPgnoNext. If page ovfl is the last page in its linked 004933 ** list, *pPgnoNext is set to zero. 004934 ** 004935 ** If ppPage is not NULL, and a reference to the MemPage object corresponding 004936 ** to page number pOvfl was obtained, then *ppPage is set to point to that 004937 ** reference. It is the responsibility of the caller to call releasePage() 004938 ** on *ppPage to free the reference. In no reference was obtained (because 004939 ** the pointer-map was used to obtain the value for *pPgnoNext), then 004940 ** *ppPage is set to zero. 
004941 */ 004942 static int getOverflowPage( 004943 BtShared *pBt, /* The database file */ 004944 Pgno ovfl, /* Current overflow page number */ 004945 MemPage **ppPage, /* OUT: MemPage handle (may be NULL) */ 004946 Pgno *pPgnoNext /* OUT: Next overflow page number */ 004947 ){ 004948 Pgno next = 0; 004949 MemPage *pPage = 0; 004950 int rc = SQLITE_OK; 004951 004952 assert( sqlite3_mutex_held(pBt->mutex) ); 004953 assert(pPgnoNext); 004954 004955 #ifndef SQLITE_OMIT_AUTOVACUUM 004956 /* Try to find the next page in the overflow list using the 004957 ** autovacuum pointer-map pages. Guess that the next page in 004958 ** the overflow list is page number (ovfl+1). If that guess turns 004959 ** out to be wrong, fall back to loading the data of page 004960 ** number ovfl to determine the next page number. 004961 */ 004962 if( pBt->autoVacuum ){ 004963 Pgno pgno; 004964 Pgno iGuess = ovfl+1; 004965 u8 eType; 004966 004967 while( PTRMAP_ISPAGE(pBt, iGuess) || iGuess==PENDING_BYTE_PAGE(pBt) ){ 004968 iGuess++; 004969 } 004970 004971 if( iGuess<=btreePagecount(pBt) ){ 004972 rc = ptrmapGet(pBt, iGuess, &eType, &pgno); 004973 if( rc==SQLITE_OK && eType==PTRMAP_OVERFLOW2 && pgno==ovfl ){ 004974 next = iGuess; 004975 rc = SQLITE_DONE; 004976 } 004977 } 004978 } 004979 #endif 004980 004981 assert( next==0 || rc==SQLITE_DONE ); 004982 if( rc==SQLITE_OK ){ 004983 rc = btreeGetPage(pBt, ovfl, &pPage, (ppPage==0) ? PAGER_GET_READONLY : 0); 004984 assert( rc==SQLITE_OK || pPage==0 ); 004985 if( rc==SQLITE_OK ){ 004986 next = get4byte(pPage->aData); 004987 } 004988 } 004989 004990 *pPgnoNext = next; 004991 if( ppPage ){ 004992 *ppPage = pPage; 004993 }else{ 004994 releasePage(pPage); 004995 } 004996 return (rc==SQLITE_DONE ? SQLITE_OK : rc); 004997 } 004998 004999 /* 005000 ** Copy data from a buffer to a page, or from a page to a buffer. 005001 ** 005002 ** pPayload is a pointer to data stored on database page pDbPage. 
005003 ** If argument eOp is false, then nByte bytes of data are copied 005004 ** from pPayload to the buffer pointed at by pBuf. If eOp is true, 005005 ** then sqlite3PagerWrite() is called on pDbPage and nByte bytes 005006 ** of data are copied from the buffer pBuf to pPayload. 005007 ** 005008 ** SQLITE_OK is returned on success, otherwise an error code. 005009 */ 005010 static int copyPayload( 005011 void *pPayload, /* Pointer to page data */ 005012 void *pBuf, /* Pointer to buffer */ 005013 int nByte, /* Number of bytes to copy */ 005014 int eOp, /* 0 -> copy from page, 1 -> copy to page */ 005015 DbPage *pDbPage /* Page containing pPayload */ 005016 ){ 005017 if( eOp ){ 005018 /* Copy data from buffer to page (a write operation) */ 005019 int rc = sqlite3PagerWrite(pDbPage); 005020 if( rc!=SQLITE_OK ){ 005021 return rc; 005022 } 005023 memcpy(pPayload, pBuf, nByte); 005024 }else{ 005025 /* Copy data from page to buffer (a read operation) */ 005026 memcpy(pBuf, pPayload, nByte); 005027 } 005028 return SQLITE_OK; 005029 } 005030 005031 /* 005032 ** This function is used to read or overwrite payload information 005033 ** for the entry that the pCur cursor is pointing to. The eOp 005034 ** argument is interpreted as follows: 005035 ** 005036 ** 0: The operation is a read. Populate the overflow cache. 005037 ** 1: The operation is a write. Populate the overflow cache. 005038 ** 005039 ** A total of "amt" bytes are read or written beginning at "offset". 005040 ** Data is read to or from the buffer pBuf. 005041 ** 005042 ** The content being read or written might appear on the main page 005043 ** or be scattered out on multiple overflow pages. 005044 ** 005045 ** If the current cursor entry uses one or more overflow pages 005046 ** this function may allocate space for and lazily populate 005047 ** the overflow page-list cache array (BtCursor.aOverflow). 005048 ** Subsequent calls use this cache to make seeking to the supplied offset 005049 ** more efficient. 
005050 ** 005051 ** Once an overflow page-list cache has been allocated, it must be 005052 ** invalidated if some other cursor writes to the same table, or if 005053 ** the cursor is moved to a different row. Additionally, in auto-vacuum 005054 ** mode, the following events may invalidate an overflow page-list cache. 005055 ** 005056 ** * An incremental vacuum, 005057 ** * A commit in auto_vacuum="full" mode, 005058 ** * Creating a table (may require moving an overflow page). 005059 */ 005060 static int accessPayload( 005061 BtCursor *pCur, /* Cursor pointing to entry to read from */ 005062 u32 offset, /* Begin reading this far into payload */ 005063 u32 amt, /* Read this many bytes */ 005064 unsigned char *pBuf, /* Write the bytes into this buffer */ 005065 int eOp /* zero to read. non-zero to write. */ 005066 ){ 005067 unsigned char *aPayload; 005068 int rc = SQLITE_OK; 005069 int iIdx = 0; 005070 MemPage *pPage = pCur->pPage; /* Btree page of current entry */ 005071 BtShared *pBt = pCur->pBt; /* Btree this cursor belongs to */ 005072 #ifdef SQLITE_DIRECT_OVERFLOW_READ 005073 unsigned char * const pBufStart = pBuf; /* Start of original out buffer */ 005074 #endif 005075 005076 assert( pPage ); 005077 assert( eOp==0 || eOp==1 ); 005078 assert( pCur->eState==CURSOR_VALID ); 005079 if( pCur->ix>=pPage->nCell ){ 005080 return SQLITE_CORRUPT_PAGE(pPage); 005081 } 005082 assert( cursorHoldsMutex(pCur) ); 005083 005084 getCellInfo(pCur); 005085 aPayload = pCur->info.pPayload; 005086 assert( offset+amt <= pCur->info.nPayload ); 005087 005088 assert( aPayload > pPage->aData ); 005089 if( (uptr)(aPayload - pPage->aData) > (pBt->usableSize - pCur->info.nLocal) ){ 005090 /* Trying to read or write past the end of the data is an error. 
The 005091 ** conditional above is really: 005092 ** &aPayload[pCur->info.nLocal] > &pPage->aData[pBt->usableSize] 005093 ** but is recast into its current form to avoid integer overflow problems 005094 */ 005095 return SQLITE_CORRUPT_PAGE(pPage); 005096 } 005097 005098 /* Check if data must be read/written to/from the btree page itself. */ 005099 if( offset<pCur->info.nLocal ){ 005100 int a = amt; 005101 if( a+offset>pCur->info.nLocal ){ 005102 a = pCur->info.nLocal - offset; 005103 } 005104 rc = copyPayload(&aPayload[offset], pBuf, a, eOp, pPage->pDbPage); 005105 offset = 0; 005106 pBuf += a; 005107 amt -= a; 005108 }else{ 005109 offset -= pCur->info.nLocal; 005110 } 005111 005112 005113 if( rc==SQLITE_OK && amt>0 ){ 005114 const u32 ovflSize = pBt->usableSize - 4; /* Bytes content per ovfl page */ 005115 Pgno nextPage; 005116 005117 nextPage = get4byte(&aPayload[pCur->info.nLocal]); 005118 005119 /* If the BtCursor.aOverflow[] has not been allocated, allocate it now. 005120 ** 005121 ** The aOverflow[] array is sized at one entry for each overflow page 005122 ** in the overflow chain. The page number of the first overflow page is 005123 ** stored in aOverflow[0], etc. A value of 0 in the aOverflow[] array 005124 ** means "not yet known" (the cache is lazily populated). 
005125 */ 005126 if( (pCur->curFlags & BTCF_ValidOvfl)==0 ){ 005127 int nOvfl = (pCur->info.nPayload-pCur->info.nLocal+ovflSize-1)/ovflSize; 005128 if( pCur->aOverflow==0 005129 || nOvfl*(int)sizeof(Pgno) > sqlite3MallocSize(pCur->aOverflow) 005130 ){ 005131 Pgno *aNew; 005132 if( sqlite3FaultSim(413) ){ 005133 aNew = 0; 005134 }else{ 005135 aNew = (Pgno*)sqlite3Realloc(pCur->aOverflow, nOvfl*2*sizeof(Pgno)); 005136 } 005137 if( aNew==0 ){ 005138 return SQLITE_NOMEM_BKPT; 005139 }else{ 005140 pCur->aOverflow = aNew; 005141 } 005142 } 005143 memset(pCur->aOverflow, 0, nOvfl*sizeof(Pgno)); 005144 pCur->curFlags |= BTCF_ValidOvfl; 005145 }else{ 005146 /* Sanity check the validity of the overflow page cache */ 005147 assert( pCur->aOverflow[0]==nextPage 005148 || pCur->aOverflow[0]==0 005149 || CORRUPT_DB ); 005150 assert( pCur->aOverflow[0]!=0 || pCur->aOverflow[offset/ovflSize]==0 ); 005151 005152 /* If the overflow page-list cache has been allocated and the 005153 ** entry for the first required overflow page is valid, skip 005154 ** directly to it. 005155 */ 005156 if( pCur->aOverflow[offset/ovflSize] ){ 005157 iIdx = (offset/ovflSize); 005158 nextPage = pCur->aOverflow[iIdx]; 005159 offset = (offset%ovflSize); 005160 } 005161 } 005162 005163 assert( rc==SQLITE_OK && amt>0 ); 005164 while( nextPage ){ 005165 /* If required, populate the overflow page-list cache. */ 005166 if( nextPage > pBt->nPage ) return SQLITE_CORRUPT_BKPT; 005167 assert( pCur->aOverflow[iIdx]==0 005168 || pCur->aOverflow[iIdx]==nextPage 005169 || CORRUPT_DB ); 005170 pCur->aOverflow[iIdx] = nextPage; 005171 005172 if( offset>=ovflSize ){ 005173 /* The only reason to read this page is to obtain the page 005174 ** number for the next page in the overflow chain. The page 005175 ** data is not required. So first try to lookup the overflow 005176 ** page-list cache, if any, then fall back to the getOverflowPage() 005177 ** function. 
005178 */ 005179 assert( pCur->curFlags & BTCF_ValidOvfl ); 005180 assert( pCur->pBtree->db==pBt->db ); 005181 if( pCur->aOverflow[iIdx+1] ){ 005182 nextPage = pCur->aOverflow[iIdx+1]; 005183 }else{ 005184 rc = getOverflowPage(pBt, nextPage, 0, &nextPage); 005185 } 005186 offset -= ovflSize; 005187 }else{ 005188 /* Need to read this page properly. It contains some of the 005189 ** range of data that is being read (eOp==0) or written (eOp!=0). 005190 */ 005191 int a = amt; 005192 if( a + offset > ovflSize ){ 005193 a = ovflSize - offset; 005194 } 005195 005196 #ifdef SQLITE_DIRECT_OVERFLOW_READ 005197 /* If all the following are true: 005198 ** 005199 ** 1) this is a read operation, and 005200 ** 2) data is required from the start of this overflow page, and 005201 ** 3) there are no dirty pages in the page-cache 005202 ** 4) the database is file-backed, and 005203 ** 5) the page is not in the WAL file 005204 ** 6) at least 4 bytes have already been read into the output buffer 005205 ** 005206 ** then data can be read directly from the database file into the 005207 ** output buffer, bypassing the page-cache altogether. This speeds 005208 ** up loading large records that span many overflow pages. 005209 */ 005210 if( eOp==0 /* (1) */ 005211 && offset==0 /* (2) */ 005212 && sqlite3PagerDirectReadOk(pBt->pPager, nextPage) /* (3,4,5) */ 005213 && &pBuf[-4]>=pBufStart /* (6) */ 005214 ){ 005215 sqlite3_file *fd = sqlite3PagerFile(pBt->pPager); 005216 u8 aSave[4]; 005217 u8 *aWrite = &pBuf[-4]; 005218 assert( aWrite>=pBufStart ); /* due to (6) */ 005219 memcpy(aSave, aWrite, 4); 005220 rc = sqlite3OsRead(fd, aWrite, a+4, (i64)pBt->pageSize*(nextPage-1)); 005221 nextPage = get4byte(aWrite); 005222 memcpy(aWrite, aSave, 4); 005223 }else 005224 #endif 005225 005226 { 005227 DbPage *pDbPage; 005228 rc = sqlite3PagerGet(pBt->pPager, nextPage, &pDbPage, 005229 (eOp==0 ? 
PAGER_GET_READONLY : 0) 005230 ); 005231 if( rc==SQLITE_OK ){ 005232 aPayload = sqlite3PagerGetData(pDbPage); 005233 nextPage = get4byte(aPayload); 005234 rc = copyPayload(&aPayload[offset+4], pBuf, a, eOp, pDbPage); 005235 sqlite3PagerUnref(pDbPage); 005236 offset = 0; 005237 } 005238 } 005239 amt -= a; 005240 if( amt==0 ) return rc; 005241 pBuf += a; 005242 } 005243 if( rc ) break; 005244 iIdx++; 005245 } 005246 } 005247 005248 if( rc==SQLITE_OK && amt>0 ){ 005249 /* Overflow chain ends prematurely: bytes are still wanted but there is no next page */ 005250 return SQLITE_CORRUPT_PAGE(pPage); 005251 } 005252 return rc; 005253 } 005254 005255 /* 005256 ** Read part of the payload for the row at which the cursor pCur is currently 005257 ** pointing. "amt" bytes will be transferred into pBuf[]. The transfer 005258 ** begins at "offset". 005259 ** 005260 ** pCur can be pointing to either a table or an index b-tree. 005261 ** If pointing to a table btree, then the content section is read. If 005262 ** pCur is pointing to an index b-tree then the key section is read. 005263 ** 005264 ** For sqlite3BtreePayload(), the caller must ensure that pCur is pointing 005265 ** to a valid row in the table. For sqlite3BtreePayloadChecked(), the 005266 ** cursor might be invalid or might need to be restored before being read. 005267 ** 005268 ** Return SQLITE_OK on success or an error code if anything goes 005269 ** wrong. An error is returned if "offset+amt" is larger than 005270 ** the available payload. ** ** This is a thin wrapper: after asserting the cursor state it forwards ** to accessPayload() with a final argument of 0, i.e. a read. 005271 */ 005272 int sqlite3BtreePayload(BtCursor *pCur, u32 offset, u32 amt, void *pBuf){ 005273 assert( cursorHoldsMutex(pCur) ); 005274 assert( pCur->eState==CURSOR_VALID ); 005275 assert( pCur->iPage>=0 && pCur->pPage ); 005276 return accessPayload(pCur, offset, amt, (unsigned char*)pBuf, 0); 005277 } 005278 005279 /* 005280 ** This variant of sqlite3BtreePayload() works even if the cursor is not 005281 ** in the CURSOR_VALID state. It is only used by the sqlite3_blob_read() 005282 ** interface. 
005283 */ 005284 #ifndef SQLITE_OMIT_INCRBLOB 005285 static SQLITE_NOINLINE int accessPayloadChecked( 005286 BtCursor *pCur, 005287 u32 offset, 005288 u32 amt, 005289 void *pBuf 005290 ){ 005291 int rc; 005292 if ( pCur->eState==CURSOR_INVALID ){ 005293 return SQLITE_ABORT; 005294 } 005295 assert( cursorOwnsBtShared(pCur) ); 005296 rc = btreeRestoreCursorPosition(pCur); 005297 return rc ? rc : accessPayload(pCur, offset, amt, pBuf, 0); 005298 } 005299 int sqlite3BtreePayloadChecked(BtCursor *pCur, u32 offset, u32 amt, void *pBuf){ 005300 if( pCur->eState==CURSOR_VALID ){ 005301 assert( cursorOwnsBtShared(pCur) ); 005302 return accessPayload(pCur, offset, amt, pBuf, 0); 005303 }else{ 005304 return accessPayloadChecked(pCur, offset, amt, pBuf); 005305 } 005306 } 005307 #endif /* SQLITE_OMIT_INCRBLOB */ 005308 005309 /* 005310 ** Return a pointer to payload information from the entry that the 005311 ** pCur cursor is pointing to. The pointer is to the beginning of 005312 ** the key if index btrees (pPage->intKey==0) and is the data for 005313 ** table btrees (pPage->intKey==1). The number of bytes of available 005314 ** key/data is written into *pAmt. If *pAmt==0, then the value 005315 ** returned will not be a valid pointer. 005316 ** 005317 ** This routine is an optimization. It is common for the entire key 005318 ** and data to fit on the local page and for there to be no overflow 005319 ** pages. When that is so, this routine can be used to access the 005320 ** key and data without making a copy. If the key and/or data spills 005321 ** onto overflow pages, then accessPayload() must be used to reassemble 005322 ** the key/data and copy it into a preallocated buffer. 005323 ** 005324 ** The pointer returned by this routine looks directly into the cached 005325 ** page of the database. The data might change or move the next time 005326 ** any btree routine is called. 
005327 */ 005328 static const void *fetchPayload( 005329 BtCursor *pCur, /* Cursor pointing to entry to read from */ 005330 u32 *pAmt /* Write the number of available bytes here */ 005331 ){ 005332 int amt; 005333 assert( pCur!=0 && pCur->iPage>=0 && pCur->pPage); 005334 assert( pCur->eState==CURSOR_VALID ); 005335 assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) ); 005336 assert( cursorOwnsBtShared(pCur) ); 005337 assert( pCur->ix<pCur->pPage->nCell || CORRUPT_DB ); 005338 assert( pCur->info.nSize>0 ); 005339 assert( pCur->info.pPayload>pCur->pPage->aData || CORRUPT_DB ); 005340 assert( pCur->info.pPayload<pCur->pPage->aDataEnd ||CORRUPT_DB); 005341 amt = pCur->info.nLocal; 005342 if( amt>(int)(pCur->pPage->aDataEnd - pCur->info.pPayload) ){ 005343 /* There is too little space on the page for the expected amount 005344 ** of local content. Database must be corrupt. */ 005345 assert( CORRUPT_DB ); 005346 amt = MAX(0, (int)(pCur->pPage->aDataEnd - pCur->info.pPayload)); 005347 } 005348 *pAmt = (u32)amt; 005349 return (void*)pCur->info.pPayload; 005350 } 005351 005352 005353 /* 005354 ** For the entry that cursor pCur is point to, return as 005355 ** many bytes of the key or data as are available on the local 005356 ** b-tree page. Write the number of available bytes into *pAmt. 005357 ** 005358 ** The pointer returned is ephemeral. The key/data may move 005359 ** or be destroyed on the next call to any Btree routine, 005360 ** including calls from other threads against the same cache. 005361 ** Hence, a mutex on the BtShared should be held prior to calling 005362 ** this routine. 005363 ** 005364 ** These routines is used to get quick access to key and data 005365 ** in the common case where no overflow pages are used. 005366 */ 005367 const void *sqlite3BtreePayloadFetch(BtCursor *pCur, u32 *pAmt){ 005368 return fetchPayload(pCur, pAmt); 005369 } 005370 005371 005372 /* 005373 ** Move the cursor down to a new child page. 
The newPgno argument is the 005374 ** page number of the child page to move to. 005375 ** 005376 ** This function returns SQLITE_CORRUPT if the page-header flags field of 005377 ** the new child page does not match the flags field of the parent (i.e. 005378 ** if an intkey page appears to be the parent of a non-intkey page, or 005379 ** vice-versa). 005380 */ 005381 static int moveToChild(BtCursor *pCur, u32 newPgno){ 005382 int rc; 005383 assert( cursorOwnsBtShared(pCur) ); 005384 assert( pCur->eState==CURSOR_VALID ); 005385 assert( pCur->iPage<BTCURSOR_MAX_DEPTH ); 005386 assert( pCur->iPage>=0 ); 005387 if( pCur->iPage>=(BTCURSOR_MAX_DEPTH-1) ){ 005388 return SQLITE_CORRUPT_BKPT; 005389 } 005390 pCur->info.nSize = 0; 005391 pCur->curFlags &= ~(BTCF_ValidNKey|BTCF_ValidOvfl); 005392 pCur->aiIdx[pCur->iPage] = pCur->ix; 005393 pCur->apPage[pCur->iPage] = pCur->pPage; 005394 pCur->ix = 0; 005395 pCur->iPage++; 005396 rc = getAndInitPage(pCur->pBt, newPgno, &pCur->pPage, pCur->curPagerFlags); 005397 assert( pCur->pPage!=0 || rc!=SQLITE_OK ); 005398 if( rc==SQLITE_OK 005399 && (pCur->pPage->nCell<1 || pCur->pPage->intKey!=pCur->curIntKey) 005400 ){ 005401 releasePage(pCur->pPage); 005402 rc = SQLITE_CORRUPT_PGNO(newPgno); 005403 } 005404 if( rc ){ 005405 pCur->pPage = pCur->apPage[--pCur->iPage]; 005406 } 005407 return rc; 005408 } 005409 005410 #ifdef SQLITE_DEBUG 005411 /* 005412 ** Page pParent is an internal (non-leaf) tree page. This function 005413 ** asserts that page number iChild is the left-child if the iIdx'th 005414 ** cell in page pParent. Or, if iIdx is equal to the total number of 005415 ** cells in pParent, that page number iChild is the right-child of 005416 ** the page. 
005417 */ 005418 static void assertParentIndex(MemPage *pParent, int iIdx, Pgno iChild){ 005419 if( CORRUPT_DB ) return; /* The conditions tested below might not be true 005420 ** in a corrupt database */ 005421 assert( iIdx<=pParent->nCell ); 005422 if( iIdx==pParent->nCell ){ 005423 assert( get4byte(&pParent->aData[pParent->hdrOffset+8])==iChild ); 005424 }else{ 005425 assert( get4byte(findCell(pParent, iIdx))==iChild ); 005426 } 005427 } 005428 #else 005429 # define assertParentIndex(x,y,z) 005430 #endif 005431 005432 /* 005433 ** Move the cursor up to the parent page. 005434 ** 005435 ** pCur->idx is set to the cell index that contains the pointer 005436 ** to the page we are coming from. If we are coming from the 005437 ** right-most child page then pCur->idx is set to one more than 005438 ** the largest cell index. 005439 */ 005440 static void moveToParent(BtCursor *pCur){ 005441 MemPage *pLeaf; 005442 assert( cursorOwnsBtShared(pCur) ); 005443 assert( pCur->eState==CURSOR_VALID ); 005444 assert( pCur->iPage>0 ); 005445 assert( pCur->pPage ); 005446 assertParentIndex( 005447 pCur->apPage[pCur->iPage-1], 005448 pCur->aiIdx[pCur->iPage-1], 005449 pCur->pPage->pgno 005450 ); 005451 testcase( pCur->aiIdx[pCur->iPage-1] > pCur->apPage[pCur->iPage-1]->nCell ); 005452 pCur->info.nSize = 0; 005453 pCur->curFlags &= ~(BTCF_ValidNKey|BTCF_ValidOvfl); 005454 pCur->ix = pCur->aiIdx[pCur->iPage-1]; 005455 pLeaf = pCur->pPage; 005456 pCur->pPage = pCur->apPage[--pCur->iPage]; 005457 releasePageNotNull(pLeaf); 005458 } 005459 005460 /* 005461 ** Move the cursor to point to the root page of its b-tree structure. 005462 ** 005463 ** If the table has a virtual root page, then the cursor is moved to point 005464 ** to the virtual root page instead of the actual root page. A table has a 005465 ** virtual root page when the actual root page contains no cells and a 005466 ** single child page. This can only happen with the table rooted at page 1. 
005467 ** 005468 ** If the b-tree structure is empty, the cursor state is set to 005469 ** CURSOR_INVALID and this routine returns SQLITE_EMPTY. Otherwise, 005470 ** the cursor is set to point to the first cell located on the root 005471 ** (or virtual root) page and the cursor state is set to CURSOR_VALID. 005472 ** 005473 ** If this function returns successfully, it may be assumed that the 005474 ** page-header flags indicate that the [virtual] root-page is the expected 005475 ** kind of b-tree page (i.e. if when opening the cursor the caller did not 005476 ** specify a KeyInfo structure the flags byte is set to 0x05 or 0x0D, 005477 ** indicating a table b-tree, or if the caller did specify a KeyInfo 005478 ** structure the flags byte is set to 0x02 or 0x0A, indicating an index 005479 ** b-tree). 005480 */ 005481 static int moveToRoot(BtCursor *pCur){ 005482 MemPage *pRoot; 005483 int rc = SQLITE_OK; 005484 005485 assert( cursorOwnsBtShared(pCur) ); 005486 assert( CURSOR_INVALID < CURSOR_REQUIRESEEK ); 005487 assert( CURSOR_VALID < CURSOR_REQUIRESEEK ); 005488 assert( CURSOR_FAULT > CURSOR_REQUIRESEEK ); 005489 assert( pCur->eState < CURSOR_REQUIRESEEK || pCur->iPage<0 ); 005490 assert( pCur->pgnoRoot>0 || pCur->iPage<0 ); 005491 005492 if( pCur->iPage>=0 ){ 005493 if( pCur->iPage ){ 005494 releasePageNotNull(pCur->pPage); 005495 while( --pCur->iPage ){ 005496 releasePageNotNull(pCur->apPage[pCur->iPage]); 005497 } 005498 pRoot = pCur->pPage = pCur->apPage[0]; 005499 goto skip_init; 005500 } 005501 }else if( pCur->pgnoRoot==0 ){ 005502 pCur->eState = CURSOR_INVALID; 005503 return SQLITE_EMPTY; 005504 }else{ 005505 assert( pCur->iPage==(-1) ); 005506 if( pCur->eState>=CURSOR_REQUIRESEEK ){ 005507 if( pCur->eState==CURSOR_FAULT ){ 005508 assert( pCur->skipNext!=SQLITE_OK ); 005509 return pCur->skipNext; 005510 } 005511 sqlite3BtreeClearCursor(pCur); 005512 } 005513 rc = getAndInitPage(pCur->pBt, pCur->pgnoRoot, &pCur->pPage, 005514 pCur->curPagerFlags); 005515 if( 
rc!=SQLITE_OK ){ 005516 pCur->eState = CURSOR_INVALID; 005517 return rc; 005518 } 005519 pCur->iPage = 0; 005520 pCur->curIntKey = pCur->pPage->intKey; 005521 } 005522 pRoot = pCur->pPage; 005523 assert( pRoot->pgno==pCur->pgnoRoot || CORRUPT_DB ); 005524 005525 /* If pCur->pKeyInfo is not NULL, then the caller that opened this cursor 005526 ** expected to open it on an index b-tree. Otherwise, if pKeyInfo is 005527 ** NULL, the caller expects a table b-tree. If this is not the case, 005528 ** return an SQLITE_CORRUPT error. 005529 ** 005530 ** Earlier versions of SQLite assumed that this test could not fail 005531 ** if the root page was already loaded when this function was called (i.e. 005532 ** if pCur->iPage>=0). But this is not so if the database is corrupted 005533 ** in such a way that page pRoot is linked into a second b-tree table 005534 ** (or the freelist). */ 005535 assert( pRoot->intKey==1 || pRoot->intKey==0 ); 005536 if( pRoot->isInit==0 || (pCur->pKeyInfo==0)!=pRoot->intKey ){ 005537 return SQLITE_CORRUPT_PAGE(pCur->pPage); 005538 } 005539 005540 skip_init: 005541 pCur->ix = 0; 005542 pCur->info.nSize = 0; 005543 pCur->curFlags &= ~(BTCF_AtLast|BTCF_ValidNKey|BTCF_ValidOvfl); 005544 005545 if( pRoot->nCell>0 ){ 005546 pCur->eState = CURSOR_VALID; 005547 }else if( !pRoot->leaf ){ 005548 Pgno subpage; 005549 if( pRoot->pgno!=1 ) return SQLITE_CORRUPT_BKPT; 005550 subpage = get4byte(&pRoot->aData[pRoot->hdrOffset+8]); 005551 pCur->eState = CURSOR_VALID; 005552 rc = moveToChild(pCur, subpage); 005553 }else{ 005554 pCur->eState = CURSOR_INVALID; 005555 rc = SQLITE_EMPTY; 005556 } 005557 return rc; 005558 } 005559 005560 /* 005561 ** Move the cursor down to the left-most leaf entry beneath the 005562 ** entry to which it is currently pointing. 005563 ** 005564 ** The left-most leaf is the one with the smallest key - the first 005565 ** in ascending order. 
005566 */ 005567 static int moveToLeftmost(BtCursor *pCur){ 005568 Pgno pgno; 005569 int rc = SQLITE_OK; 005570 MemPage *pPage; 005571 005572 assert( cursorOwnsBtShared(pCur) ); 005573 assert( pCur->eState==CURSOR_VALID ); 005574 while( rc==SQLITE_OK && !(pPage = pCur->pPage)->leaf ){ 005575 assert( pCur->ix<pPage->nCell ); 005576 pgno = get4byte(findCell(pPage, pCur->ix)); 005577 rc = moveToChild(pCur, pgno); 005578 } 005579 return rc; 005580 } 005581 005582 /* 005583 ** Move the cursor down to the right-most leaf entry beneath the 005584 ** page to which it is currently pointing. Notice the difference 005585 ** between moveToLeftmost() and moveToRightmost(). moveToLeftmost() 005586 ** finds the left-most entry beneath the *entry* whereas moveToRightmost() 005587 ** finds the right-most entry beneath the *page*. 005588 ** 005589 ** The right-most entry is the one with the largest key - the last 005590 ** key in ascending order. 005591 */ 005592 static int moveToRightmost(BtCursor *pCur){ 005593 Pgno pgno; 005594 int rc = SQLITE_OK; 005595 MemPage *pPage = 0; 005596 005597 assert( cursorOwnsBtShared(pCur) ); 005598 assert( pCur->eState==CURSOR_VALID ); 005599 while( !(pPage = pCur->pPage)->leaf ){ 005600 pgno = get4byte(&pPage->aData[pPage->hdrOffset+8]); 005601 pCur->ix = pPage->nCell; 005602 rc = moveToChild(pCur, pgno); 005603 if( rc ) return rc; 005604 } 005605 pCur->ix = pPage->nCell-1; 005606 assert( pCur->info.nSize==0 ); 005607 assert( (pCur->curFlags & BTCF_ValidNKey)==0 ); 005608 return SQLITE_OK; 005609 } 005610 005611 /* Move the cursor to the first entry in the table. Return SQLITE_OK 005612 ** on success. Set *pRes to 0 if the cursor actually points to something 005613 ** or set *pRes to 1 if the table is empty. 
005614 */ 005615 int sqlite3BtreeFirst(BtCursor *pCur, int *pRes){ 005616 int rc; 005617 005618 assert( cursorOwnsBtShared(pCur) ); 005619 assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) ); 005620 rc = moveToRoot(pCur); 005621 if( rc==SQLITE_OK ){ 005622 assert( pCur->pPage->nCell>0 ); 005623 *pRes = 0; 005624 rc = moveToLeftmost(pCur); 005625 }else if( rc==SQLITE_EMPTY ){ 005626 assert( pCur->pgnoRoot==0 || (pCur->pPage!=0 && pCur->pPage->nCell==0) ); 005627 *pRes = 1; 005628 rc = SQLITE_OK; 005629 } 005630 return rc; 005631 } 005632 005633 #ifdef SQLITE_DEBUG 005634 /* The cursors is CURSOR_VALID and has BTCF_AtLast set. Verify that 005635 ** this flags are true for a consistent database. 005636 ** 005637 ** This routine is is called from within assert() statements only. 005638 ** It is an internal verification routine and does not appear in production 005639 ** builds. 005640 */ 005641 static int cursorIsAtLastEntry(BtCursor *pCur){ 005642 int ii; 005643 for(ii=0; ii<pCur->iPage; ii++){ 005644 if( pCur->aiIdx[ii]!=pCur->apPage[ii]->nCell ) return 0; 005645 } 005646 return pCur->ix==pCur->pPage->nCell-1 && pCur->pPage->leaf!=0; 005647 } 005648 #endif 005649 005650 /* Move the cursor to the last entry in the table. Return SQLITE_OK 005651 ** on success. Set *pRes to 0 if the cursor actually points to something 005652 ** or set *pRes to 1 if the table is empty. 
005653 */ 005654 static SQLITE_NOINLINE int btreeLast(BtCursor *pCur, int *pRes){ 005655 int rc = moveToRoot(pCur); 005656 if( rc==SQLITE_OK ){ 005657 assert( pCur->eState==CURSOR_VALID ); 005658 *pRes = 0; 005659 rc = moveToRightmost(pCur); 005660 if( rc==SQLITE_OK ){ 005661 pCur->curFlags |= BTCF_AtLast; 005662 }else{ 005663 pCur->curFlags &= ~BTCF_AtLast; 005664 } 005665 }else if( rc==SQLITE_EMPTY ){ 005666 assert( pCur->pgnoRoot==0 || pCur->pPage->nCell==0 ); 005667 *pRes = 1; 005668 rc = SQLITE_OK; 005669 } 005670 return rc; 005671 } 005672 int sqlite3BtreeLast(BtCursor *pCur, int *pRes){ 005673 assert( cursorOwnsBtShared(pCur) ); 005674 assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) ); 005675 005676 /* If the cursor already points to the last entry, this is a no-op. */ 005677 if( CURSOR_VALID==pCur->eState && (pCur->curFlags & BTCF_AtLast)!=0 ){ 005678 assert( cursorIsAtLastEntry(pCur) || CORRUPT_DB ); 005679 *pRes = 0; 005680 return SQLITE_OK; 005681 } 005682 return btreeLast(pCur, pRes); 005683 } 005684 005685 /* Move the cursor so that it points to an entry in a table (a.k.a INTKEY) 005686 ** table near the key intKey. Return a success code. 005687 ** 005688 ** If an exact match is not found, then the cursor is always 005689 ** left pointing at a leaf page which would hold the entry if it 005690 ** were present. The cursor might point to an entry that comes 005691 ** before or after the key. 005692 ** 005693 ** An integer is written into *pRes which is the result of 005694 ** comparing the key with the entry to which the cursor is 005695 ** pointing. The meaning of the integer written into 005696 ** *pRes is as follows: 005697 ** 005698 ** *pRes<0 The cursor is left pointing at an entry that 005699 ** is smaller than intKey or if the table is empty 005700 ** and the cursor is therefore left point to nothing. 005701 ** 005702 ** *pRes==0 The cursor is left pointing at an entry that 005703 ** exactly matches intKey. 
005704 ** 005705 ** *pRes>0 The cursor is left pointing at an entry that 005706 ** is larger than intKey. 005707 */ 005708 int sqlite3BtreeTableMoveto( 005709 BtCursor *pCur, /* The cursor to be moved */ 005710 i64 intKey, /* The table key */ 005711 int biasRight, /* If true, bias the search to the high end */ 005712 int *pRes /* Write search results here */ 005713 ){ 005714 int rc; 005715 005716 assert( cursorOwnsBtShared(pCur) ); 005717 assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) ); 005718 assert( pRes ); 005719 assert( pCur->pKeyInfo==0 ); 005720 assert( pCur->eState!=CURSOR_VALID || pCur->curIntKey!=0 ); 005721 005722 /* If the cursor is already positioned at the point we are trying 005723 ** to move to, then just return without doing any work */ 005724 if( pCur->eState==CURSOR_VALID && (pCur->curFlags & BTCF_ValidNKey)!=0 ){ 005725 if( pCur->info.nKey==intKey ){ 005726 *pRes = 0; 005727 return SQLITE_OK; 005728 } 005729 if( pCur->info.nKey<intKey ){ 005730 if( (pCur->curFlags & BTCF_AtLast)!=0 ){ 005731 assert( cursorIsAtLastEntry(pCur) || CORRUPT_DB ); 005732 *pRes = -1; 005733 return SQLITE_OK; 005734 } 005735 /* If the requested key is one more than the previous key, then 005736 ** try to get there using sqlite3BtreeNext() rather than a full 005737 ** binary search. This is an optimization only. The correct answer 005738 ** is still obtained without this case, only a little more slowly. 
*/ 005739 if( pCur->info.nKey+1==intKey ){ 005740 *pRes = 0; 005741 rc = sqlite3BtreeNext(pCur, 0); 005742 if( rc==SQLITE_OK ){ 005743 getCellInfo(pCur); 005744 if( pCur->info.nKey==intKey ){ 005745 return SQLITE_OK; 005746 } 005747 }else if( rc!=SQLITE_DONE ){ 005748 return rc; 005749 } 005750 } 005751 } 005752 } 005753 005754 #ifdef SQLITE_DEBUG 005755 pCur->pBtree->nSeek++; /* Performance measurement during testing */ 005756 #endif 005757 005758 rc = moveToRoot(pCur); 005759 if( rc ){ 005760 if( rc==SQLITE_EMPTY ){ 005761 assert( pCur->pgnoRoot==0 || pCur->pPage->nCell==0 ); 005762 *pRes = -1; 005763 return SQLITE_OK; 005764 } 005765 return rc; 005766 } 005767 assert( pCur->pPage ); 005768 assert( pCur->pPage->isInit ); 005769 assert( pCur->eState==CURSOR_VALID ); 005770 assert( pCur->pPage->nCell > 0 ); 005771 assert( pCur->iPage==0 || pCur->apPage[0]->intKey==pCur->curIntKey ); 005772 assert( pCur->curIntKey ); 005773 005774 for(;;){ 005775 int lwr, upr, idx, c; 005776 Pgno chldPg; 005777 MemPage *pPage = pCur->pPage; 005778 u8 *pCell; /* Pointer to current cell in pPage */ 005779 005780 /* pPage->nCell must be greater than zero. If this is the root-page 005781 ** the cursor would have been INVALID above and this for(;;) loop 005782 ** not run. If this is not the root-page, then the moveToChild() routine 005783 ** would have already detected db corruption. Similarly, pPage must 005784 ** be the right kind (index or table) of b-tree page. Otherwise 005785 ** a moveToChild() or moveToRoot() call would have detected corruption. */ 005786 assert( pPage->nCell>0 ); 005787 assert( pPage->intKey ); 005788 lwr = 0; 005789 upr = pPage->nCell-1; 005790 assert( biasRight==0 || biasRight==1 ); 005791 idx = upr>>(1-biasRight); /* idx = biasRight ? 
upr : (lwr+upr)/2; */ 005792 for(;;){ 005793 i64 nCellKey; 005794 pCell = findCellPastPtr(pPage, idx); 005795 if( pPage->intKeyLeaf ){ 005796 while( 0x80 <= *(pCell++) ){ 005797 if( pCell>=pPage->aDataEnd ){ 005798 return SQLITE_CORRUPT_PAGE(pPage); 005799 } 005800 } 005801 } 005802 getVarint(pCell, (u64*)&nCellKey); 005803 if( nCellKey<intKey ){ 005804 lwr = idx+1; 005805 if( lwr>upr ){ c = -1; break; } 005806 }else if( nCellKey>intKey ){ 005807 upr = idx-1; 005808 if( lwr>upr ){ c = +1; break; } 005809 }else{ 005810 assert( nCellKey==intKey ); 005811 pCur->ix = (u16)idx; 005812 if( !pPage->leaf ){ 005813 lwr = idx; 005814 goto moveto_table_next_layer; 005815 }else{ 005816 pCur->curFlags |= BTCF_ValidNKey; 005817 pCur->info.nKey = nCellKey; 005818 pCur->info.nSize = 0; 005819 *pRes = 0; 005820 return SQLITE_OK; 005821 } 005822 } 005823 assert( lwr+upr>=0 ); 005824 idx = (lwr+upr)>>1; /* idx = (lwr+upr)/2; */ 005825 } 005826 assert( lwr==upr+1 || !pPage->leaf ); 005827 assert( pPage->isInit ); 005828 if( pPage->leaf ){ 005829 assert( pCur->ix<pCur->pPage->nCell ); 005830 pCur->ix = (u16)idx; 005831 *pRes = c; 005832 rc = SQLITE_OK; 005833 goto moveto_table_finish; 005834 } 005835 moveto_table_next_layer: 005836 if( lwr>=pPage->nCell ){ 005837 chldPg = get4byte(&pPage->aData[pPage->hdrOffset+8]); 005838 }else{ 005839 chldPg = get4byte(findCell(pPage, lwr)); 005840 } 005841 pCur->ix = (u16)lwr; 005842 rc = moveToChild(pCur, chldPg); 005843 if( rc ) break; 005844 } 005845 moveto_table_finish: 005846 pCur->info.nSize = 0; 005847 assert( (pCur->curFlags & BTCF_ValidOvfl)==0 ); 005848 return rc; 005849 } 005850 005851 /* 005852 ** Compare the "idx"-th cell on the page the cursor pCur is currently 005853 ** pointing to to pIdxKey using xRecordCompare. Return negative or 005854 ** zero if the cell is less than or equal pIdxKey. Return positive 005855 ** if unknown. 
005856 ** 005857 ** Return value negative: Cell at pCur[idx] less than pIdxKey 005858 ** 005859 ** Return value is zero: Cell at pCur[idx] equals pIdxKey 005860 ** 005861 ** Return value positive: Nothing is known about the relationship 005862 ** of the cell at pCur[idx] and pIdxKey. 005863 ** 005864 ** This routine is part of an optimization. It is always safe to return 005865 ** a positive value as that will cause the optimization to be skipped. 005866 */ 005867 static int indexCellCompare( 005868 BtCursor *pCur, 005869 int idx, 005870 UnpackedRecord *pIdxKey, 005871 RecordCompare xRecordCompare 005872 ){ 005873 MemPage *pPage = pCur->pPage; 005874 int c; 005875 int nCell; /* Size of the pCell cell in bytes */ 005876 u8 *pCell = findCellPastPtr(pPage, idx); 005877 005878 nCell = pCell[0]; 005879 if( nCell<=pPage->max1bytePayload ){ 005880 /* This branch runs if the record-size field of the cell is a 005881 ** single byte varint and the record fits entirely on the main 005882 ** b-tree page. */ 005883 testcase( pCell+nCell+1==pPage->aDataEnd ); 005884 c = xRecordCompare(nCell, (void*)&pCell[1], pIdxKey); 005885 }else if( !(pCell[1] & 0x80) 005886 && (nCell = ((nCell&0x7f)<<7) + pCell[1])<=pPage->maxLocal 005887 ){ 005888 /* The record-size field is a 2 byte varint and the record 005889 ** fits entirely on the main b-tree page. */ 005890 testcase( pCell+nCell+2==pPage->aDataEnd ); 005891 c = xRecordCompare(nCell, (void*)&pCell[2], pIdxKey); 005892 }else{ 005893 /* If the record extends into overflow pages, do not attempt 005894 ** the optimization. */ 005895 c = 99; 005896 } 005897 return c; 005898 } 005899 005900 /* 005901 ** Return true (non-zero) if pCur is current pointing to the last 005902 ** page of a table. 
005903 */ 005904 static int cursorOnLastPage(BtCursor *pCur){ 005905 int i; 005906 assert( pCur->eState==CURSOR_VALID ); 005907 for(i=0; i<pCur->iPage; i++){ 005908 MemPage *pPage = pCur->apPage[i]; 005909 if( pCur->aiIdx[i]<pPage->nCell ) return 0; 005910 } 005911 return 1; 005912 } 005913 005914 /* Move the cursor so that it points to an entry in an index table 005915 ** near the key pIdxKey. Return a success code. 005916 ** 005917 ** If an exact match is not found, then the cursor is always 005918 ** left pointing at a leaf page which would hold the entry if it 005919 ** were present. The cursor might point to an entry that comes 005920 ** before or after the key. 005921 ** 005922 ** An integer is written into *pRes which is the result of 005923 ** comparing the key with the entry to which the cursor is 005924 ** pointing. The meaning of the integer written into 005925 ** *pRes is as follows: 005926 ** 005927 ** *pRes<0 The cursor is left pointing at an entry that 005928 ** is smaller than pIdxKey or if the table is empty 005929 ** and the cursor is therefore left point to nothing. 005930 ** 005931 ** *pRes==0 The cursor is left pointing at an entry that 005932 ** exactly matches pIdxKey. 005933 ** 005934 ** *pRes>0 The cursor is left pointing at an entry that 005935 ** is larger than pIdxKey. 005936 ** 005937 ** The pIdxKey->eqSeen field is set to 1 if there 005938 ** exists an entry in the table that exactly matches pIdxKey. 
005939 */ 005940 int sqlite3BtreeIndexMoveto( 005941 BtCursor *pCur, /* The cursor to be moved */ 005942 UnpackedRecord *pIdxKey, /* Unpacked index key */ 005943 int *pRes /* Write search results here */ 005944 ){ 005945 int rc; 005946 RecordCompare xRecordCompare; 005947 005948 assert( cursorOwnsBtShared(pCur) ); 005949 assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) ); 005950 assert( pRes ); 005951 assert( pCur->pKeyInfo!=0 ); 005952 005953 #ifdef SQLITE_DEBUG 005954 pCur->pBtree->nSeek++; /* Performance measurement during testing */ 005955 #endif 005956 005957 xRecordCompare = sqlite3VdbeFindCompare(pIdxKey); 005958 pIdxKey->errCode = 0; 005959 assert( pIdxKey->default_rc==1 005960 || pIdxKey->default_rc==0 005961 || pIdxKey->default_rc==-1 005962 ); 005963 005964 005965 /* Check to see if we can skip a lot of work. Two cases: 005966 ** 005967 ** (1) If the cursor is already pointing to the very last cell 005968 ** in the table and the pIdxKey search key is greater than or 005969 ** equal to that last cell, then no movement is required. 005970 ** 005971 ** (2) If the cursor is on the last page of the table and the first 005972 ** cell on that last page is less than or equal to the pIdxKey 005973 ** search key, then we can start the search on the current page 005974 ** without needing to go back to root. 
005975 */ 005976 if( pCur->eState==CURSOR_VALID 005977 && pCur->pPage->leaf 005978 && cursorOnLastPage(pCur) 005979 ){ 005980 int c; 005981 if( pCur->ix==pCur->pPage->nCell-1 005982 && (c = indexCellCompare(pCur, pCur->ix, pIdxKey, xRecordCompare))<=0 005983 && pIdxKey->errCode==SQLITE_OK 005984 ){ 005985 *pRes = c; 005986 return SQLITE_OK; /* Cursor already pointing at the correct spot */ 005987 } 005988 if( pCur->iPage>0 005989 && indexCellCompare(pCur, 0, pIdxKey, xRecordCompare)<=0 005990 && pIdxKey->errCode==SQLITE_OK 005991 ){ 005992 pCur->curFlags &= ~BTCF_ValidOvfl; 005993 if( !pCur->pPage->isInit ){ 005994 return SQLITE_CORRUPT_BKPT; 005995 } 005996 goto bypass_moveto_root; /* Start search on the current page */ 005997 } 005998 pIdxKey->errCode = SQLITE_OK; 005999 } 006000 006001 rc = moveToRoot(pCur); 006002 if( rc ){ 006003 if( rc==SQLITE_EMPTY ){ 006004 assert( pCur->pgnoRoot==0 || pCur->pPage->nCell==0 ); 006005 *pRes = -1; 006006 return SQLITE_OK; 006007 } 006008 return rc; 006009 } 006010 006011 bypass_moveto_root: 006012 assert( pCur->pPage ); 006013 assert( pCur->pPage->isInit ); 006014 assert( pCur->eState==CURSOR_VALID ); 006015 assert( pCur->pPage->nCell > 0 ); 006016 assert( pCur->curIntKey==0 ); 006017 assert( pIdxKey!=0 ); 006018 for(;;){ 006019 int lwr, upr, idx, c; 006020 Pgno chldPg; 006021 MemPage *pPage = pCur->pPage; 006022 u8 *pCell; /* Pointer to current cell in pPage */ 006023 006024 /* pPage->nCell must be greater than zero. If this is the root-page 006025 ** the cursor would have been INVALID above and this for(;;) loop 006026 ** not run. If this is not the root-page, then the moveToChild() routine 006027 ** would have already detected db corruption. Similarly, pPage must 006028 ** be the right kind (index or table) of b-tree page. Otherwise 006029 ** a moveToChild() or moveToRoot() call would have detected corruption. 
*/ 006030 assert( pPage->nCell>0 ); 006031 assert( pPage->intKey==0 ); 006032 lwr = 0; 006033 upr = pPage->nCell-1; 006034 idx = upr>>1; /* idx = (lwr+upr)/2; */ 006035 for(;;){ 006036 int nCell; /* Size of the pCell cell in bytes */ 006037 pCell = findCellPastPtr(pPage, idx); 006038 006039 /* The maximum supported page-size is 65536 bytes. This means that 006040 ** the maximum number of record bytes stored on an index B-Tree 006041 ** page is less than 16384 bytes and may be stored as a 2-byte 006042 ** varint. This information is used to attempt to avoid parsing 006043 ** the entire cell by checking for the cases where the record is 006044 ** stored entirely within the b-tree page by inspecting the first 006045 ** 2 bytes of the cell. 006046 */ 006047 nCell = pCell[0]; 006048 if( nCell<=pPage->max1bytePayload ){ 006049 /* This branch runs if the record-size field of the cell is a 006050 ** single byte varint and the record fits entirely on the main 006051 ** b-tree page. */ 006052 testcase( pCell+nCell+1==pPage->aDataEnd ); 006053 c = xRecordCompare(nCell, (void*)&pCell[1], pIdxKey); 006054 }else if( !(pCell[1] & 0x80) 006055 && (nCell = ((nCell&0x7f)<<7) + pCell[1])<=pPage->maxLocal 006056 ){ 006057 /* The record-size field is a 2 byte varint and the record 006058 ** fits entirely on the main b-tree page. */ 006059 testcase( pCell+nCell+2==pPage->aDataEnd ); 006060 c = xRecordCompare(nCell, (void*)&pCell[2], pIdxKey); 006061 }else{ 006062 /* The record flows over onto one or more overflow pages. In 006063 ** this case the whole cell needs to be parsed, a buffer allocated 006064 ** and accessPayload() used to retrieve the record into the 006065 ** buffer before VdbeRecordCompare() can be called. 006066 ** 006067 ** If the record is corrupt, the xRecordCompare routine may read 006068 ** up to two varints past the end of the buffer. An extra 18 006069 ** bytes of padding is allocated at the end of the buffer in 006070 ** case this happens. 
*/ 006071 void *pCellKey; 006072 u8 * const pCellBody = pCell - pPage->childPtrSize; 006073 const int nOverrun = 18; /* Size of the overrun padding */ 006074 pPage->xParseCell(pPage, pCellBody, &pCur->info); 006075 nCell = (int)pCur->info.nKey; 006076 testcase( nCell<0 ); /* True if key size is 2^32 or more */ 006077 testcase( nCell==0 ); /* Invalid key size: 0x80 0x80 0x00 */ 006078 testcase( nCell==1 ); /* Invalid key size: 0x80 0x80 0x01 */ 006079 testcase( nCell==2 ); /* Minimum legal index key size */ 006080 if( nCell<2 || nCell/pCur->pBt->usableSize>pCur->pBt->nPage ){ 006081 rc = SQLITE_CORRUPT_PAGE(pPage); 006082 goto moveto_index_finish; 006083 } 006084 pCellKey = sqlite3Malloc( nCell+nOverrun ); 006085 if( pCellKey==0 ){ 006086 rc = SQLITE_NOMEM_BKPT; 006087 goto moveto_index_finish; 006088 } 006089 pCur->ix = (u16)idx; 006090 rc = accessPayload(pCur, 0, nCell, (unsigned char*)pCellKey, 0); 006091 memset(((u8*)pCellKey)+nCell,0,nOverrun); /* Fix uninit warnings */ 006092 pCur->curFlags &= ~BTCF_ValidOvfl; 006093 if( rc ){ 006094 sqlite3_free(pCellKey); 006095 goto moveto_index_finish; 006096 } 006097 c = sqlite3VdbeRecordCompare(nCell, pCellKey, pIdxKey); 006098 sqlite3_free(pCellKey); 006099 } 006100 assert( 006101 (pIdxKey->errCode!=SQLITE_CORRUPT || c==0) 006102 && (pIdxKey->errCode!=SQLITE_NOMEM || pCur->pBtree->db->mallocFailed) 006103 ); 006104 if( c<0 ){ 006105 lwr = idx+1; 006106 }else if( c>0 ){ 006107 upr = idx-1; 006108 }else{ 006109 assert( c==0 ); 006110 *pRes = 0; 006111 rc = SQLITE_OK; 006112 pCur->ix = (u16)idx; 006113 if( pIdxKey->errCode ) rc = SQLITE_CORRUPT_BKPT; 006114 goto moveto_index_finish; 006115 } 006116 if( lwr>upr ) break; 006117 assert( lwr+upr>=0 ); 006118 idx = (lwr+upr)>>1; /* idx = (lwr+upr)/2 */ 006119 } 006120 assert( lwr==upr+1 || (pPage->intKey && !pPage->leaf) ); 006121 assert( pPage->isInit ); 006122 if( pPage->leaf ){ 006123 assert( pCur->ix<pCur->pPage->nCell || CORRUPT_DB ); 006124 pCur->ix = (u16)idx; 006125 
*pRes = c; 006126 rc = SQLITE_OK; 006127 goto moveto_index_finish; 006128 } 006129 if( lwr>=pPage->nCell ){ 006130 chldPg = get4byte(&pPage->aData[pPage->hdrOffset+8]); 006131 }else{ 006132 chldPg = get4byte(findCell(pPage, lwr)); 006133 } 006134 006135 /* This block is similar to an in-lined version of: 006136 ** 006137 ** pCur->ix = (u16)lwr; 006138 ** rc = moveToChild(pCur, chldPg); 006139 ** if( rc ) break; 006140 */ 006141 pCur->info.nSize = 0; 006142 pCur->curFlags &= ~(BTCF_ValidNKey|BTCF_ValidOvfl); 006143 if( pCur->iPage>=(BTCURSOR_MAX_DEPTH-1) ){ 006144 return SQLITE_CORRUPT_BKPT; 006145 } 006146 pCur->aiIdx[pCur->iPage] = (u16)lwr; 006147 pCur->apPage[pCur->iPage] = pCur->pPage; 006148 pCur->ix = 0; 006149 pCur->iPage++; 006150 rc = getAndInitPage(pCur->pBt, chldPg, &pCur->pPage, pCur->curPagerFlags); 006151 if( rc==SQLITE_OK 006152 && (pCur->pPage->nCell<1 || pCur->pPage->intKey!=pCur->curIntKey) 006153 ){ 006154 releasePage(pCur->pPage); 006155 rc = SQLITE_CORRUPT_PGNO(chldPg); 006156 } 006157 if( rc ){ 006158 pCur->pPage = pCur->apPage[--pCur->iPage]; 006159 break; 006160 } 006161 /* 006162 ***** End of in-lined moveToChild() call */ 006163 } 006164 moveto_index_finish: 006165 pCur->info.nSize = 0; 006166 assert( (pCur->curFlags & BTCF_ValidOvfl)==0 ); 006167 return rc; 006168 } 006169 006170 006171 /* 006172 ** Return TRUE if the cursor is not pointing at an entry of the table. 006173 ** 006174 ** TRUE will be returned after a call to sqlite3BtreeNext() moves 006175 ** past the last entry in the table or sqlite3BtreePrev() moves past 006176 ** the first entry. TRUE is also returned if the table is empty. 006177 */ 006178 int sqlite3BtreeEof(BtCursor *pCur){ 006179 /* TODO: What if the cursor is in CURSOR_REQUIRESEEK but all table entries 006180 ** have been deleted? This API will need to change to return an error code 006181 ** as well as the boolean result value. 
006182 */ 006183 return (CURSOR_VALID!=pCur->eState); 006184 } 006185 006186 /* 006187 ** Return an estimate for the number of rows in the table that pCur is 006188 ** pointing to. Return a negative number if no estimate is currently 006189 ** available. 006190 */ 006191 i64 sqlite3BtreeRowCountEst(BtCursor *pCur){ 006192 i64 n; 006193 u8 i; 006194 006195 assert( cursorOwnsBtShared(pCur) ); 006196 assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) ); 006197 006198 /* Currently this interface is only called by the OP_IfSizeBetween 006199 ** opcode and the OP_Count opcode with P3=1. In either case, 006200 ** the cursor will always be valid unless the btree is empty. */ 006201 if( pCur->eState!=CURSOR_VALID ) return 0; 006202 if( NEVER(pCur->pPage->leaf==0) ) return -1; 006203 006204 n = pCur->pPage->nCell; 006205 for(i=0; i<pCur->iPage; i++){ 006206 n *= pCur->apPage[i]->nCell; 006207 } 006208 return n; 006209 } 006210 006211 /* 006212 ** Advance the cursor to the next entry in the database. 006213 ** Return value: 006214 ** 006215 ** SQLITE_OK success 006216 ** SQLITE_DONE cursor is already pointing at the last element 006217 ** otherwise some kind of error occurred 006218 ** 006219 ** The main entry point is sqlite3BtreeNext(). That routine is optimized 006220 ** for the common case of merely incrementing the cell counter BtCursor.aiIdx 006221 ** to the next cell on the current page. The (slower) btreeNext() helper 006222 ** routine is called when it is necessary to move to a different page or 006223 ** to restore the cursor. 006224 ** 006225 ** If bit 0x01 of the F argument in sqlite3BtreeNext(C,F) is 1, then the 006226 ** cursor corresponds to an SQL index and this routine could have been 006227 ** skipped if the SQL index had been a unique index. The F argument 006228 ** is a hint to the implement. SQLite btree implementation does not use 006229 ** this hint, but COMDB2 does. 
006230 */ 006231 static SQLITE_NOINLINE int btreeNext(BtCursor *pCur){ 006232 int rc; 006233 int idx; 006234 MemPage *pPage; 006235 006236 assert( cursorOwnsBtShared(pCur) ); 006237 if( pCur->eState!=CURSOR_VALID ){ 006238 assert( (pCur->curFlags & BTCF_ValidOvfl)==0 ); 006239 rc = restoreCursorPosition(pCur); 006240 if( rc!=SQLITE_OK ){ 006241 return rc; 006242 } 006243 if( CURSOR_INVALID==pCur->eState ){ 006244 return SQLITE_DONE; 006245 } 006246 if( pCur->eState==CURSOR_SKIPNEXT ){ 006247 pCur->eState = CURSOR_VALID; 006248 if( pCur->skipNext>0 ) return SQLITE_OK; 006249 } 006250 } 006251 006252 pPage = pCur->pPage; 006253 idx = ++pCur->ix; 006254 if( sqlite3FaultSim(412) ) pPage->isInit = 0; 006255 if( !pPage->isInit ){ 006256 return SQLITE_CORRUPT_BKPT; 006257 } 006258 006259 if( idx>=pPage->nCell ){ 006260 if( !pPage->leaf ){ 006261 rc = moveToChild(pCur, get4byte(&pPage->aData[pPage->hdrOffset+8])); 006262 if( rc ) return rc; 006263 return moveToLeftmost(pCur); 006264 } 006265 do{ 006266 if( pCur->iPage==0 ){ 006267 pCur->eState = CURSOR_INVALID; 006268 return SQLITE_DONE; 006269 } 006270 moveToParent(pCur); 006271 pPage = pCur->pPage; 006272 }while( pCur->ix>=pPage->nCell ); 006273 if( pPage->intKey ){ 006274 return sqlite3BtreeNext(pCur, 0); 006275 }else{ 006276 return SQLITE_OK; 006277 } 006278 } 006279 if( pPage->leaf ){ 006280 return SQLITE_OK; 006281 }else{ 006282 return moveToLeftmost(pCur); 006283 } 006284 } 006285 int sqlite3BtreeNext(BtCursor *pCur, int flags){ 006286 MemPage *pPage; 006287 UNUSED_PARAMETER( flags ); /* Used in COMDB2 but not native SQLite */ 006288 assert( cursorOwnsBtShared(pCur) ); 006289 assert( flags==0 || flags==1 ); 006290 pCur->info.nSize = 0; 006291 pCur->curFlags &= ~(BTCF_ValidNKey|BTCF_ValidOvfl); 006292 if( pCur->eState!=CURSOR_VALID ) return btreeNext(pCur); 006293 pPage = pCur->pPage; 006294 if( (++pCur->ix)>=pPage->nCell ){ 006295 pCur->ix--; 006296 return btreeNext(pCur); 006297 } 006298 if( pPage->leaf ){ 006299 
return SQLITE_OK; 006300 }else{ 006301 return moveToLeftmost(pCur); 006302 } 006303 } 006304 006305 /* 006306 ** Step the cursor to the back to the previous entry in the database. 006307 ** Return values: 006308 ** 006309 ** SQLITE_OK success 006310 ** SQLITE_DONE the cursor is already on the first element of the table 006311 ** otherwise some kind of error occurred 006312 ** 006313 ** The main entry point is sqlite3BtreePrevious(). That routine is optimized 006314 ** for the common case of merely decrementing the cell counter BtCursor.aiIdx 006315 ** to the previous cell on the current page. The (slower) btreePrevious() 006316 ** helper routine is called when it is necessary to move to a different page 006317 ** or to restore the cursor. 006318 ** 006319 ** If bit 0x01 of the F argument to sqlite3BtreePrevious(C,F) is 1, then 006320 ** the cursor corresponds to an SQL index and this routine could have been 006321 ** skipped if the SQL index had been a unique index. The F argument is a 006322 ** hint to the implement. The native SQLite btree implementation does not 006323 ** use this hint, but COMDB2 does. 
006324 */ 006325 static SQLITE_NOINLINE int btreePrevious(BtCursor *pCur){ 006326 int rc; 006327 MemPage *pPage; 006328 006329 assert( cursorOwnsBtShared(pCur) ); 006330 assert( (pCur->curFlags & (BTCF_AtLast|BTCF_ValidOvfl|BTCF_ValidNKey))==0 ); 006331 assert( pCur->info.nSize==0 ); 006332 if( pCur->eState!=CURSOR_VALID ){ 006333 rc = restoreCursorPosition(pCur); 006334 if( rc!=SQLITE_OK ){ 006335 return rc; 006336 } 006337 if( CURSOR_INVALID==pCur->eState ){ 006338 return SQLITE_DONE; 006339 } 006340 if( CURSOR_SKIPNEXT==pCur->eState ){ 006341 pCur->eState = CURSOR_VALID; 006342 if( pCur->skipNext<0 ) return SQLITE_OK; 006343 } 006344 } 006345 006346 pPage = pCur->pPage; 006347 if( sqlite3FaultSim(412) ) pPage->isInit = 0; 006348 if( !pPage->isInit ){ 006349 return SQLITE_CORRUPT_BKPT; 006350 } 006351 if( !pPage->leaf ){ 006352 int idx = pCur->ix; 006353 rc = moveToChild(pCur, get4byte(findCell(pPage, idx))); 006354 if( rc ) return rc; 006355 rc = moveToRightmost(pCur); 006356 }else{ 006357 while( pCur->ix==0 ){ 006358 if( pCur->iPage==0 ){ 006359 pCur->eState = CURSOR_INVALID; 006360 return SQLITE_DONE; 006361 } 006362 moveToParent(pCur); 006363 } 006364 assert( pCur->info.nSize==0 ); 006365 assert( (pCur->curFlags & (BTCF_ValidOvfl))==0 ); 006366 006367 pCur->ix--; 006368 pPage = pCur->pPage; 006369 if( pPage->intKey && !pPage->leaf ){ 006370 rc = sqlite3BtreePrevious(pCur, 0); 006371 }else{ 006372 rc = SQLITE_OK; 006373 } 006374 } 006375 return rc; 006376 } 006377 int sqlite3BtreePrevious(BtCursor *pCur, int flags){ 006378 assert( cursorOwnsBtShared(pCur) ); 006379 assert( flags==0 || flags==1 ); 006380 UNUSED_PARAMETER( flags ); /* Used in COMDB2 but not native SQLite */ 006381 pCur->curFlags &= ~(BTCF_AtLast|BTCF_ValidOvfl|BTCF_ValidNKey); 006382 pCur->info.nSize = 0; 006383 if( pCur->eState!=CURSOR_VALID 006384 || pCur->ix==0 006385 || pCur->pPage->leaf==0 006386 ){ 006387 return btreePrevious(pCur); 006388 } 006389 pCur->ix--; 006390 return SQLITE_OK; 
006391 } 006392 006393 /* 006394 ** Allocate a new page from the database file. 006395 ** 006396 ** The new page is marked as dirty. (In other words, sqlite3PagerWrite() 006397 ** has already been called on the new page.) The new page has also 006398 ** been referenced and the calling routine is responsible for calling 006399 ** sqlite3PagerUnref() on the new page when it is done. 006400 ** 006401 ** SQLITE_OK is returned on success. Any other return value indicates 006402 ** an error. *ppPage is set to NULL in the event of an error. 006403 ** 006404 ** If the "nearby" parameter is not 0, then an effort is made to 006405 ** locate a page close to the page number "nearby". This can be used in an 006406 ** attempt to keep related pages close to each other in the database file, 006407 ** which in turn can make database access faster. 006408 ** 006409 ** If the eMode parameter is BTALLOC_EXACT and the nearby page exists 006410 ** anywhere on the free-list, then it is guaranteed to be returned. If 006411 ** eMode is BTALLOC_LT then the page returned will be less than or equal 006412 ** to nearby if any such page exists. If eMode is BTALLOC_ANY then there 006413 ** are no restrictions on which page is returned. 
006414 */ 006415 static int allocateBtreePage( 006416 BtShared *pBt, /* The btree */ 006417 MemPage **ppPage, /* Store pointer to the allocated page here */ 006418 Pgno *pPgno, /* Store the page number here */ 006419 Pgno nearby, /* Search for a page near this one */ 006420 u8 eMode /* BTALLOC_EXACT, BTALLOC_LT, or BTALLOC_ANY */ 006421 ){ 006422 MemPage *pPage1; 006423 int rc; 006424 u32 n; /* Number of pages on the freelist */ 006425 u32 k; /* Number of leaves on the trunk of the freelist */ 006426 MemPage *pTrunk = 0; 006427 MemPage *pPrevTrunk = 0; 006428 Pgno mxPage; /* Total size of the database file */ 006429 006430 assert( sqlite3_mutex_held(pBt->mutex) ); 006431 assert( eMode==BTALLOC_ANY || (nearby>0 && IfNotOmitAV(pBt->autoVacuum)) ); 006432 pPage1 = pBt->pPage1; 006433 mxPage = btreePagecount(pBt); 006434 /* EVIDENCE-OF: R-21003-45125 The 4-byte big-endian integer at offset 36 006435 ** stores the total number of pages on the freelist. */ 006436 n = get4byte(&pPage1->aData[36]); 006437 testcase( n==mxPage-1 ); 006438 if( n>=mxPage ){ 006439 return SQLITE_CORRUPT_BKPT; 006440 } 006441 if( n>0 ){ 006442 /* There are pages on the freelist. Reuse one of those pages. */ 006443 Pgno iTrunk; 006444 u8 searchList = 0; /* If the free-list must be searched for 'nearby' */ 006445 u32 nSearch = 0; /* Count of the number of search attempts */ 006446 006447 /* If eMode==BTALLOC_EXACT and a query of the pointer-map 006448 ** shows that the page 'nearby' is somewhere on the free-list, then 006449 ** the entire-list will be searched for that page. 
006450 */ 006451 #ifndef SQLITE_OMIT_AUTOVACUUM 006452 if( eMode==BTALLOC_EXACT ){ 006453 if( nearby<=mxPage ){ 006454 u8 eType; 006455 assert( nearby>0 ); 006456 assert( pBt->autoVacuum ); 006457 rc = ptrmapGet(pBt, nearby, &eType, 0); 006458 if( rc ) return rc; 006459 if( eType==PTRMAP_FREEPAGE ){ 006460 searchList = 1; 006461 } 006462 } 006463 }else if( eMode==BTALLOC_LE ){ 006464 searchList = 1; 006465 } 006466 #endif 006467 006468 /* Decrement the free-list count by 1. Set iTrunk to the index of the 006469 ** first free-list trunk page. iPrevTrunk is initially 1. 006470 */ 006471 rc = sqlite3PagerWrite(pPage1->pDbPage); 006472 if( rc ) return rc; 006473 put4byte(&pPage1->aData[36], n-1); 006474 006475 /* The code within this loop is run only once if the 'searchList' variable 006476 ** is not true. Otherwise, it runs once for each trunk-page on the 006477 ** free-list until the page 'nearby' is located (eMode==BTALLOC_EXACT) 006478 ** or until a page less than 'nearby' is located (eMode==BTALLOC_LT) 006479 */ 006480 do { 006481 pPrevTrunk = pTrunk; 006482 if( pPrevTrunk ){ 006483 /* EVIDENCE-OF: R-01506-11053 The first integer on a freelist trunk page 006484 ** is the page number of the next freelist trunk page in the list or 006485 ** zero if this is the last freelist trunk page. */ 006486 iTrunk = get4byte(&pPrevTrunk->aData[0]); 006487 }else{ 006488 /* EVIDENCE-OF: R-59841-13798 The 4-byte big-endian integer at offset 32 006489 ** stores the page number of the first page of the freelist, or zero if 006490 ** the freelist is empty. */ 006491 iTrunk = get4byte(&pPage1->aData[32]); 006492 } 006493 testcase( iTrunk==mxPage ); 006494 if( iTrunk>mxPage || nSearch++ > n ){ 006495 rc = SQLITE_CORRUPT_PGNO(pPrevTrunk ? 
pPrevTrunk->pgno : 1); 006496 }else{ 006497 rc = btreeGetUnusedPage(pBt, iTrunk, &pTrunk, 0); 006498 } 006499 if( rc ){ 006500 pTrunk = 0; 006501 goto end_allocate_page; 006502 } 006503 assert( pTrunk!=0 ); 006504 assert( pTrunk->aData!=0 ); 006505 /* EVIDENCE-OF: R-13523-04394 The second integer on a freelist trunk page 006506 ** is the number of leaf page pointers to follow. */ 006507 k = get4byte(&pTrunk->aData[4]); 006508 if( k==0 && !searchList ){ 006509 /* The trunk has no leaves and the list is not being searched. 006510 ** So extract the trunk page itself and use it as the newly 006511 ** allocated page */ 006512 assert( pPrevTrunk==0 ); 006513 rc = sqlite3PagerWrite(pTrunk->pDbPage); 006514 if( rc ){ 006515 goto end_allocate_page; 006516 } 006517 *pPgno = iTrunk; 006518 memcpy(&pPage1->aData[32], &pTrunk->aData[0], 4); 006519 *ppPage = pTrunk; 006520 pTrunk = 0; 006521 TRACE(("ALLOCATE: %u trunk - %u free pages left\n", *pPgno, n-1)); 006522 }else if( k>(u32)(pBt->usableSize/4 - 2) ){ 006523 /* Value of k is out of range. Database corruption */ 006524 rc = SQLITE_CORRUPT_PGNO(iTrunk); 006525 goto end_allocate_page; 006526 #ifndef SQLITE_OMIT_AUTOVACUUM 006527 }else if( searchList 006528 && (nearby==iTrunk || (iTrunk<nearby && eMode==BTALLOC_LE)) 006529 ){ 006530 /* The list is being searched and this trunk page is the page 006531 ** to allocate, regardless of whether it has leaves. 
006532 */ 006533 *pPgno = iTrunk; 006534 *ppPage = pTrunk; 006535 searchList = 0; 006536 rc = sqlite3PagerWrite(pTrunk->pDbPage); 006537 if( rc ){ 006538 goto end_allocate_page; 006539 } 006540 if( k==0 ){ 006541 if( !pPrevTrunk ){ 006542 memcpy(&pPage1->aData[32], &pTrunk->aData[0], 4); 006543 }else{ 006544 rc = sqlite3PagerWrite(pPrevTrunk->pDbPage); 006545 if( rc!=SQLITE_OK ){ 006546 goto end_allocate_page; 006547 } 006548 memcpy(&pPrevTrunk->aData[0], &pTrunk->aData[0], 4); 006549 } 006550 }else{ 006551 /* The trunk page is required by the caller but it contains 006552 ** pointers to free-list leaves. The first leaf becomes a trunk 006553 ** page in this case. 006554 */ 006555 MemPage *pNewTrunk; 006556 Pgno iNewTrunk = get4byte(&pTrunk->aData[8]); 006557 if( iNewTrunk>mxPage ){ 006558 rc = SQLITE_CORRUPT_PGNO(iTrunk); 006559 goto end_allocate_page; 006560 } 006561 testcase( iNewTrunk==mxPage ); 006562 rc = btreeGetUnusedPage(pBt, iNewTrunk, &pNewTrunk, 0); 006563 if( rc!=SQLITE_OK ){ 006564 goto end_allocate_page; 006565 } 006566 rc = sqlite3PagerWrite(pNewTrunk->pDbPage); 006567 if( rc!=SQLITE_OK ){ 006568 releasePage(pNewTrunk); 006569 goto end_allocate_page; 006570 } 006571 memcpy(&pNewTrunk->aData[0], &pTrunk->aData[0], 4); 006572 put4byte(&pNewTrunk->aData[4], k-1); 006573 memcpy(&pNewTrunk->aData[8], &pTrunk->aData[12], (k-1)*4); 006574 releasePage(pNewTrunk); 006575 if( !pPrevTrunk ){ 006576 assert( sqlite3PagerIswriteable(pPage1->pDbPage) ); 006577 put4byte(&pPage1->aData[32], iNewTrunk); 006578 }else{ 006579 rc = sqlite3PagerWrite(pPrevTrunk->pDbPage); 006580 if( rc ){ 006581 goto end_allocate_page; 006582 } 006583 put4byte(&pPrevTrunk->aData[0], iNewTrunk); 006584 } 006585 } 006586 pTrunk = 0; 006587 TRACE(("ALLOCATE: %u trunk - %u free pages left\n", *pPgno, n-1)); 006588 #endif 006589 }else if( k>0 ){ 006590 /* Extract a leaf from the trunk */ 006591 u32 closest; 006592 Pgno iPage; 006593 unsigned char *aData = pTrunk->aData; 006594 if( nearby>0 ){ 
006595 u32 i; 006596 closest = 0; 006597 if( eMode==BTALLOC_LE ){ 006598 for(i=0; i<k; i++){ 006599 iPage = get4byte(&aData[8+i*4]); 006600 if( iPage<=nearby ){ 006601 closest = i; 006602 break; 006603 } 006604 } 006605 }else{ 006606 int dist; 006607 dist = sqlite3AbsInt32(get4byte(&aData[8]) - nearby); 006608 for(i=1; i<k; i++){ 006609 int d2 = sqlite3AbsInt32(get4byte(&aData[8+i*4]) - nearby); 006610 if( d2<dist ){ 006611 closest = i; 006612 dist = d2; 006613 } 006614 } 006615 } 006616 }else{ 006617 closest = 0; 006618 } 006619 006620 iPage = get4byte(&aData[8+closest*4]); 006621 testcase( iPage==mxPage ); 006622 if( iPage>mxPage || iPage<2 ){ 006623 rc = SQLITE_CORRUPT_PGNO(iTrunk); 006624 goto end_allocate_page; 006625 } 006626 testcase( iPage==mxPage ); 006627 if( !searchList 006628 || (iPage==nearby || (iPage<nearby && eMode==BTALLOC_LE)) 006629 ){ 006630 int noContent; 006631 *pPgno = iPage; 006632 TRACE(("ALLOCATE: %u was leaf %u of %u on trunk %u" 006633 ": %u more free pages\n", 006634 *pPgno, closest+1, k, pTrunk->pgno, n-1)); 006635 rc = sqlite3PagerWrite(pTrunk->pDbPage); 006636 if( rc ) goto end_allocate_page; 006637 if( closest<k-1 ){ 006638 memcpy(&aData[8+closest*4], &aData[4+k*4], 4); 006639 } 006640 put4byte(&aData[4], k-1); 006641 noContent = !btreeGetHasContent(pBt, *pPgno)? PAGER_GET_NOCONTENT : 0; 006642 rc = btreeGetUnusedPage(pBt, *pPgno, ppPage, noContent); 006643 if( rc==SQLITE_OK ){ 006644 rc = sqlite3PagerWrite((*ppPage)->pDbPage); 006645 if( rc!=SQLITE_OK ){ 006646 releasePage(*ppPage); 006647 *ppPage = 0; 006648 } 006649 } 006650 searchList = 0; 006651 } 006652 } 006653 releasePage(pPrevTrunk); 006654 pPrevTrunk = 0; 006655 }while( searchList ); 006656 }else{ 006657 /* There are no pages on the freelist, so append a new page to the 006658 ** database image. 006659 ** 006660 ** Normally, new pages allocated by this block can be requested from the 006661 ** pager layer with the 'no-content' flag set. 
This prevents the pager 006662 ** from trying to read the pages content from disk. However, if the 006663 ** current transaction has already run one or more incremental-vacuum 006664 ** steps, then the page we are about to allocate may contain content 006665 ** that is required in the event of a rollback. In this case, do 006666 ** not set the no-content flag. This causes the pager to load and journal 006667 ** the current page content before overwriting it. 006668 ** 006669 ** Note that the pager will not actually attempt to load or journal 006670 ** content for any page that really does lie past the end of the database 006671 ** file on disk. So the effects of disabling the no-content optimization 006672 ** here are confined to those pages that lie between the end of the 006673 ** database image and the end of the database file. 006674 */ 006675 int bNoContent = (0==IfNotOmitAV(pBt->bDoTruncate))? PAGER_GET_NOCONTENT:0; 006676 006677 rc = sqlite3PagerWrite(pBt->pPage1->pDbPage); 006678 if( rc ) return rc; 006679 pBt->nPage++; 006680 if( pBt->nPage==PENDING_BYTE_PAGE(pBt) ) pBt->nPage++; 006681 006682 #ifndef SQLITE_OMIT_AUTOVACUUM 006683 if( pBt->autoVacuum && PTRMAP_ISPAGE(pBt, pBt->nPage) ){ 006684 /* If *pPgno refers to a pointer-map page, allocate two new pages 006685 ** at the end of the file instead of one. The first allocated page 006686 ** becomes a new pointer-map page, the second is used by the caller. 
006687 */ 006688 MemPage *pPg = 0; 006689 TRACE(("ALLOCATE: %u from end of file (pointer-map page)\n", pBt->nPage)); 006690 assert( pBt->nPage!=PENDING_BYTE_PAGE(pBt) ); 006691 rc = btreeGetUnusedPage(pBt, pBt->nPage, &pPg, bNoContent); 006692 if( rc==SQLITE_OK ){ 006693 rc = sqlite3PagerWrite(pPg->pDbPage); 006694 releasePage(pPg); 006695 } 006696 if( rc ) return rc; 006697 pBt->nPage++; 006698 if( pBt->nPage==PENDING_BYTE_PAGE(pBt) ){ pBt->nPage++; } 006699 } 006700 #endif 006701 put4byte(28 + (u8*)pBt->pPage1->aData, pBt->nPage); 006702 *pPgno = pBt->nPage; 006703 006704 assert( *pPgno!=PENDING_BYTE_PAGE(pBt) ); 006705 rc = btreeGetUnusedPage(pBt, *pPgno, ppPage, bNoContent); 006706 if( rc ) return rc; 006707 rc = sqlite3PagerWrite((*ppPage)->pDbPage); 006708 if( rc!=SQLITE_OK ){ 006709 releasePage(*ppPage); 006710 *ppPage = 0; 006711 } 006712 TRACE(("ALLOCATE: %u from end of file\n", *pPgno)); 006713 } 006714 006715 assert( CORRUPT_DB || *pPgno!=PENDING_BYTE_PAGE(pBt) ); 006716 006717 end_allocate_page: 006718 releasePage(pTrunk); 006719 releasePage(pPrevTrunk); 006720 assert( rc!=SQLITE_OK || sqlite3PagerPageRefcount((*ppPage)->pDbPage)<=1 ); 006721 assert( rc!=SQLITE_OK || (*ppPage)->isInit==0 ); 006722 return rc; 006723 } 006724 006725 /* 006726 ** This function is used to add page iPage to the database file free-list. 006727 ** It is assumed that the page is not already a part of the free-list. 006728 ** 006729 ** The value passed as the second argument to this function is optional. 006730 ** If the caller happens to have a pointer to the MemPage object 006731 ** corresponding to page iPage handy, it may pass it as the second value. 006732 ** Otherwise, it may pass NULL. 006733 ** 006734 ** If a pointer to a MemPage object is passed as the second argument, 006735 ** its reference count is not altered by this function. 
006736 */ 006737 static int freePage2(BtShared *pBt, MemPage *pMemPage, Pgno iPage){ 006738 MemPage *pTrunk = 0; /* Free-list trunk page */ 006739 Pgno iTrunk = 0; /* Page number of free-list trunk page */ 006740 MemPage *pPage1 = pBt->pPage1; /* Local reference to page 1 */ 006741 MemPage *pPage; /* Page being freed. May be NULL. */ 006742 int rc; /* Return Code */ 006743 u32 nFree; /* Initial number of pages on free-list */ 006744 006745 assert( sqlite3_mutex_held(pBt->mutex) ); 006746 assert( CORRUPT_DB || iPage>1 ); 006747 assert( !pMemPage || pMemPage->pgno==iPage ); 006748 006749 if( iPage<2 || iPage>pBt->nPage ){ 006750 return SQLITE_CORRUPT_BKPT; 006751 } 006752 if( pMemPage ){ 006753 pPage = pMemPage; 006754 sqlite3PagerRef(pPage->pDbPage); 006755 }else{ 006756 pPage = btreePageLookup(pBt, iPage); 006757 } 006758 006759 /* Increment the free page count on pPage1 */ 006760 rc = sqlite3PagerWrite(pPage1->pDbPage); 006761 if( rc ) goto freepage_out; 006762 nFree = get4byte(&pPage1->aData[36]); 006763 put4byte(&pPage1->aData[36], nFree+1); 006764 006765 if( pBt->btsFlags & BTS_SECURE_DELETE ){ 006766 /* If the secure_delete option is enabled, then 006767 ** always fully overwrite deleted information with zeros. 006768 */ 006769 if( (!pPage && ((rc = btreeGetPage(pBt, iPage, &pPage, 0))!=0) ) 006770 || ((rc = sqlite3PagerWrite(pPage->pDbPage))!=0) 006771 ){ 006772 goto freepage_out; 006773 } 006774 memset(pPage->aData, 0, pPage->pBt->pageSize); 006775 } 006776 006777 /* If the database supports auto-vacuum, write an entry in the pointer-map 006778 ** to indicate that the page is free. 006779 */ 006780 if( ISAUTOVACUUM(pBt) ){ 006781 ptrmapPut(pBt, iPage, PTRMAP_FREEPAGE, 0, &rc); 006782 if( rc ) goto freepage_out; 006783 } 006784 006785 /* Now manipulate the actual database free-list structure. There are two 006786 ** possibilities. 
If the free-list is currently empty, or if the first 006787 ** trunk page in the free-list is full, then this page will become a 006788 ** new free-list trunk page. Otherwise, it will become a leaf of the 006789 ** first trunk page in the current free-list. This block tests if it 006790 ** is possible to add the page as a new free-list leaf. 006791 */ 006792 if( nFree!=0 ){ 006793 u32 nLeaf; /* Initial number of leaf cells on trunk page */ 006794 006795 iTrunk = get4byte(&pPage1->aData[32]); 006796 if( iTrunk>btreePagecount(pBt) ){ 006797 rc = SQLITE_CORRUPT_BKPT; 006798 goto freepage_out; 006799 } 006800 rc = btreeGetPage(pBt, iTrunk, &pTrunk, 0); 006801 if( rc!=SQLITE_OK ){ 006802 goto freepage_out; 006803 } 006804 006805 nLeaf = get4byte(&pTrunk->aData[4]); 006806 assert( pBt->usableSize>32 ); 006807 if( nLeaf > (u32)pBt->usableSize/4 - 2 ){ 006808 rc = SQLITE_CORRUPT_BKPT; 006809 goto freepage_out; 006810 } 006811 if( nLeaf < (u32)pBt->usableSize/4 - 8 ){ 006812 /* In this case there is room on the trunk page to insert the page 006813 ** being freed as a new leaf. 006814 ** 006815 ** Note that the trunk page is not really full until it contains 006816 ** usableSize/4 - 2 entries, not usableSize/4 - 8 entries as we have 006817 ** coded. But due to a coding error in versions of SQLite prior to 006818 ** 3.6.0, databases with freelist trunk pages holding more than 006819 ** usableSize/4 - 8 entries will be reported as corrupt. In order 006820 ** to maintain backwards compatibility with older versions of SQLite, 006821 ** we will continue to restrict the number of entries to usableSize/4 - 8 006822 ** for now. At some point in the future (once everyone has upgraded 006823 ** to 3.6.0 or later) we should consider fixing the conditional above 006824 ** to read "usableSize/4-2" instead of "usableSize/4-8". 
006825 ** 006826 ** EVIDENCE-OF: R-19920-11576 However, newer versions of SQLite still 006827 ** avoid using the last six entries in the freelist trunk page array in 006828 ** order that database files created by newer versions of SQLite can be 006829 ** read by older versions of SQLite. 006830 */ 006831 rc = sqlite3PagerWrite(pTrunk->pDbPage); 006832 if( rc==SQLITE_OK ){ 006833 put4byte(&pTrunk->aData[4], nLeaf+1); 006834 put4byte(&pTrunk->aData[8+nLeaf*4], iPage); 006835 if( pPage && (pBt->btsFlags & BTS_SECURE_DELETE)==0 ){ 006836 sqlite3PagerDontWrite(pPage->pDbPage); 006837 } 006838 rc = btreeSetHasContent(pBt, iPage); 006839 } 006840 TRACE(("FREE-PAGE: %u leaf on trunk page %u\n",pPage->pgno,pTrunk->pgno)); 006841 goto freepage_out; 006842 } 006843 } 006844 006845 /* If control flows to this point, then it was not possible to add the 006846 ** the page being freed as a leaf page of the first trunk in the free-list. 006847 ** Possibly because the free-list is empty, or possibly because the 006848 ** first trunk in the free-list is full. Either way, the page being freed 006849 ** will become the new first trunk page in the free-list. 
006850 */ 006851 if( pPage==0 && SQLITE_OK!=(rc = btreeGetPage(pBt, iPage, &pPage, 0)) ){ 006852 goto freepage_out; 006853 } 006854 rc = sqlite3PagerWrite(pPage->pDbPage); 006855 if( rc!=SQLITE_OK ){ 006856 goto freepage_out; 006857 } 006858 put4byte(pPage->aData, iTrunk); 006859 put4byte(&pPage->aData[4], 0); 006860 put4byte(&pPage1->aData[32], iPage); 006861 TRACE(("FREE-PAGE: %u new trunk page replacing %u\n", pPage->pgno, iTrunk)); 006862 006863 freepage_out: 006864 if( pPage ){ 006865 pPage->isInit = 0; 006866 } 006867 releasePage(pPage); 006868 releasePage(pTrunk); 006869 return rc; 006870 } 006871 static void freePage(MemPage *pPage, int *pRC){ 006872 if( (*pRC)==SQLITE_OK ){ 006873 *pRC = freePage2(pPage->pBt, pPage, pPage->pgno); 006874 } 006875 } 006876 006877 /* 006878 ** Free the overflow pages associated with the given Cell. 006879 */ 006880 static SQLITE_NOINLINE int clearCellOverflow( 006881 MemPage *pPage, /* The page that contains the Cell */ 006882 unsigned char *pCell, /* First byte of the Cell */ 006883 CellInfo *pInfo /* Size information about the cell */ 006884 ){ 006885 BtShared *pBt; 006886 Pgno ovflPgno; 006887 int rc; 006888 int nOvfl; 006889 u32 ovflPageSize; 006890 006891 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 006892 assert( pInfo->nLocal!=pInfo->nPayload ); 006893 testcase( pCell + pInfo->nSize == pPage->aDataEnd ); 006894 testcase( pCell + (pInfo->nSize-1) == pPage->aDataEnd ); 006895 if( pCell + pInfo->nSize > pPage->aDataEnd ){ 006896 /* Cell extends past end of page */ 006897 return SQLITE_CORRUPT_PAGE(pPage); 006898 } 006899 ovflPgno = get4byte(pCell + pInfo->nSize - 4); 006900 pBt = pPage->pBt; 006901 assert( pBt->usableSize > 4 ); 006902 ovflPageSize = pBt->usableSize - 4; 006903 nOvfl = (pInfo->nPayload - pInfo->nLocal + ovflPageSize - 1)/ovflPageSize; 006904 assert( nOvfl>0 || 006905 (CORRUPT_DB && (pInfo->nPayload + ovflPageSize)<ovflPageSize) 006906 ); 006907 while( nOvfl-- ){ 006908 Pgno iNext = 0; 006909 MemPage 
*pOvfl = 0; 006910 if( ovflPgno<2 || ovflPgno>btreePagecount(pBt) ){ 006911 /* 0 is not a legal page number and page 1 cannot be an 006912 ** overflow page. Therefore if ovflPgno<2 or past the end of the 006913 ** file the database must be corrupt. */ 006914 return SQLITE_CORRUPT_BKPT; 006915 } 006916 if( nOvfl ){ 006917 rc = getOverflowPage(pBt, ovflPgno, &pOvfl, &iNext); 006918 if( rc ) return rc; 006919 } 006920 006921 if( ( pOvfl || ((pOvfl = btreePageLookup(pBt, ovflPgno))!=0) ) 006922 && sqlite3PagerPageRefcount(pOvfl->pDbPage)!=1 006923 ){ 006924 /* There is no reason any cursor should have an outstanding reference 006925 ** to an overflow page belonging to a cell that is being deleted/updated. 006926 ** So if there exists more than one reference to this page, then it 006927 ** must not really be an overflow page and the database must be corrupt. 006928 ** It is helpful to detect this before calling freePage2(), as 006929 ** freePage2() may zero the page contents if secure-delete mode is 006930 ** enabled. If this 'overflow' page happens to be a page that the 006931 ** caller is iterating through or using in some other way, this 006932 ** can be problematic. 006933 */ 006934 rc = SQLITE_CORRUPT_BKPT; 006935 }else{ 006936 rc = freePage2(pBt, pOvfl, ovflPgno); 006937 } 006938 006939 if( pOvfl ){ 006940 sqlite3PagerUnref(pOvfl->pDbPage); 006941 } 006942 if( rc ) return rc; 006943 ovflPgno = iNext; 006944 } 006945 return SQLITE_OK; 006946 } 006947 006948 /* Call xParseCell to compute the size of a cell. If the cell contains 006949 ** overflow, then invoke cellClearOverflow to clear out that overflow. 006950 ** Store the result code (SQLITE_OK or some error code) in rc. 006951 ** 006952 ** Implemented as macro to force inlining for performance. 
006953 */ 006954 #define BTREE_CLEAR_CELL(rc, pPage, pCell, sInfo) \ 006955 pPage->xParseCell(pPage, pCell, &sInfo); \ 006956 if( sInfo.nLocal!=sInfo.nPayload ){ \ 006957 rc = clearCellOverflow(pPage, pCell, &sInfo); \ 006958 }else{ \ 006959 rc = SQLITE_OK; \ 006960 } 006961 006962 006963 /* 006964 ** Create the byte sequence used to represent a cell on page pPage 006965 ** and write that byte sequence into pCell[]. Overflow pages are 006966 ** allocated and filled in as necessary. The calling procedure 006967 ** is responsible for making sure sufficient space has been allocated 006968 ** for pCell[]. 006969 ** 006970 ** Note that pCell does not necessary need to point to the pPage->aData 006971 ** area. pCell might point to some temporary storage. The cell will 006972 ** be constructed in this temporary area then copied into pPage->aData 006973 ** later. 006974 */ 006975 static int fillInCell( 006976 MemPage *pPage, /* The page that contains the cell */ 006977 unsigned char *pCell, /* Complete text of the cell */ 006978 const BtreePayload *pX, /* Payload with which to construct the cell */ 006979 int *pnSize /* Write cell size here */ 006980 ){ 006981 int nPayload; 006982 const u8 *pSrc; 006983 int nSrc, n, rc, mn; 006984 int spaceLeft; 006985 MemPage *pToRelease; 006986 unsigned char *pPrior; 006987 unsigned char *pPayload; 006988 BtShared *pBt; 006989 Pgno pgnoOvfl; 006990 int nHeader; 006991 006992 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 006993 006994 /* pPage is not necessarily writeable since pCell might be auxiliary 006995 ** buffer space that is separate from the pPage buffer area */ 006996 assert( pCell<pPage->aData || pCell>=&pPage->aData[pPage->pBt->pageSize] 006997 || sqlite3PagerIswriteable(pPage->pDbPage) ); 006998 006999 /* Fill in the header. 
*/ 007000 nHeader = pPage->childPtrSize; 007001 if( pPage->intKey ){ 007002 nPayload = pX->nData + pX->nZero; 007003 pSrc = pX->pData; 007004 nSrc = pX->nData; 007005 assert( pPage->intKeyLeaf ); /* fillInCell() only called for leaves */ 007006 nHeader += putVarint32(&pCell[nHeader], nPayload); 007007 nHeader += putVarint(&pCell[nHeader], *(u64*)&pX->nKey); 007008 }else{ 007009 assert( pX->nKey<=0x7fffffff && pX->pKey!=0 ); 007010 nSrc = nPayload = (int)pX->nKey; 007011 pSrc = pX->pKey; 007012 nHeader += putVarint32(&pCell[nHeader], nPayload); 007013 } 007014 007015 /* Fill in the payload */ 007016 pPayload = &pCell[nHeader]; 007017 if( nPayload<=pPage->maxLocal ){ 007018 /* This is the common case where everything fits on the btree page 007019 ** and no overflow pages are required. */ 007020 n = nHeader + nPayload; 007021 testcase( n==3 ); 007022 testcase( n==4 ); 007023 if( n<4 ){ 007024 n = 4; 007025 pPayload[nPayload] = 0; 007026 } 007027 *pnSize = n; 007028 assert( nSrc<=nPayload ); 007029 testcase( nSrc<nPayload ); 007030 memcpy(pPayload, pSrc, nSrc); 007031 memset(pPayload+nSrc, 0, nPayload-nSrc); 007032 return SQLITE_OK; 007033 } 007034 007035 /* If we reach this point, it means that some of the content will need 007036 ** to spill onto overflow pages. 007037 */ 007038 mn = pPage->minLocal; 007039 n = mn + (nPayload - mn) % (pPage->pBt->usableSize - 4); 007040 testcase( n==pPage->maxLocal ); 007041 testcase( n==pPage->maxLocal+1 ); 007042 if( n > pPage->maxLocal ) n = mn; 007043 spaceLeft = n; 007044 *pnSize = n + nHeader + 4; 007045 pPrior = &pCell[nHeader+n]; 007046 pToRelease = 0; 007047 pgnoOvfl = 0; 007048 pBt = pPage->pBt; 007049 007050 /* At this point variables should be set as follows: 007051 ** 007052 ** nPayload Total payload size in bytes 007053 ** pPayload Begin writing payload here 007054 ** spaceLeft Space available at pPayload. If nPayload>spaceLeft, 007055 ** that means content must spill into overflow pages. 
007056 ** *pnSize Size of the local cell (not counting overflow pages) 007057 ** pPrior Where to write the pgno of the first overflow page 007058 ** 007059 ** Use a call to btreeParseCellPtr() to verify that the values above 007060 ** were computed correctly. 007061 */ 007062 #ifdef SQLITE_DEBUG 007063 { 007064 CellInfo info; 007065 pPage->xParseCell(pPage, pCell, &info); 007066 assert( nHeader==(int)(info.pPayload - pCell) ); 007067 assert( info.nKey==pX->nKey ); 007068 assert( *pnSize == info.nSize ); 007069 assert( spaceLeft == info.nLocal ); 007070 } 007071 #endif 007072 007073 /* Write the payload into the local Cell and any extra into overflow pages */ 007074 while( 1 ){ 007075 n = nPayload; 007076 if( n>spaceLeft ) n = spaceLeft; 007077 007078 /* If pToRelease is not zero than pPayload points into the data area 007079 ** of pToRelease. Make sure pToRelease is still writeable. */ 007080 assert( pToRelease==0 || sqlite3PagerIswriteable(pToRelease->pDbPage) ); 007081 007082 /* If pPayload is part of the data area of pPage, then make sure pPage 007083 ** is still writeable */ 007084 assert( pPayload<pPage->aData || pPayload>=&pPage->aData[pBt->pageSize] 007085 || sqlite3PagerIswriteable(pPage->pDbPage) ); 007086 007087 if( nSrc>=n ){ 007088 memcpy(pPayload, pSrc, n); 007089 }else if( nSrc>0 ){ 007090 n = nSrc; 007091 memcpy(pPayload, pSrc, n); 007092 }else{ 007093 memset(pPayload, 0, n); 007094 } 007095 nPayload -= n; 007096 if( nPayload<=0 ) break; 007097 pPayload += n; 007098 pSrc += n; 007099 nSrc -= n; 007100 spaceLeft -= n; 007101 if( spaceLeft==0 ){ 007102 MemPage *pOvfl = 0; 007103 #ifndef SQLITE_OMIT_AUTOVACUUM 007104 Pgno pgnoPtrmap = pgnoOvfl; /* Overflow page pointer-map entry page */ 007105 if( pBt->autoVacuum ){ 007106 do{ 007107 pgnoOvfl++; 007108 } while( 007109 PTRMAP_ISPAGE(pBt, pgnoOvfl) || pgnoOvfl==PENDING_BYTE_PAGE(pBt) 007110 ); 007111 } 007112 #endif 007113 rc = allocateBtreePage(pBt, &pOvfl, &pgnoOvfl, pgnoOvfl, 0); 007114 #ifndef 
SQLITE_OMIT_AUTOVACUUM 007115 /* If the database supports auto-vacuum, and the second or subsequent 007116 ** overflow page is being allocated, add an entry to the pointer-map 007117 ** for that page now. 007118 ** 007119 ** If this is the first overflow page, then write a partial entry 007120 ** to the pointer-map. If we write nothing to this pointer-map slot, 007121 ** then the optimistic overflow chain processing in clearCell() 007122 ** may misinterpret the uninitialized values and delete the 007123 ** wrong pages from the database. 007124 */ 007125 if( pBt->autoVacuum && rc==SQLITE_OK ){ 007126 u8 eType = (pgnoPtrmap?PTRMAP_OVERFLOW2:PTRMAP_OVERFLOW1); 007127 ptrmapPut(pBt, pgnoOvfl, eType, pgnoPtrmap, &rc); 007128 if( rc ){ 007129 releasePage(pOvfl); 007130 } 007131 } 007132 #endif 007133 if( rc ){ 007134 releasePage(pToRelease); 007135 return rc; 007136 } 007137 007138 /* If pToRelease is not zero than pPrior points into the data area 007139 ** of pToRelease. Make sure pToRelease is still writeable. */ 007140 assert( pToRelease==0 || sqlite3PagerIswriteable(pToRelease->pDbPage) ); 007141 007142 /* If pPrior is part of the data area of pPage, then make sure pPage 007143 ** is still writeable */ 007144 assert( pPrior<pPage->aData || pPrior>=&pPage->aData[pBt->pageSize] 007145 || sqlite3PagerIswriteable(pPage->pDbPage) ); 007146 007147 put4byte(pPrior, pgnoOvfl); 007148 releasePage(pToRelease); 007149 pToRelease = pOvfl; 007150 pPrior = pOvfl->aData; 007151 put4byte(pPrior, 0); 007152 pPayload = &pOvfl->aData[4]; 007153 spaceLeft = pBt->usableSize - 4; 007154 } 007155 } 007156 releasePage(pToRelease); 007157 return SQLITE_OK; 007158 } 007159 007160 /* 007161 ** Remove the i-th cell from pPage. This routine effects pPage only. 007162 ** The cell content is not freed or deallocated. It is assumed that 007163 ** the cell content has been copied someplace else. This routine just 007164 ** removes the reference to the cell from pPage. 
007165 ** 007166 ** "sz" must be the number of bytes in the cell. 007167 */ 007168 static void dropCell(MemPage *pPage, int idx, int sz, int *pRC){ 007169 u32 pc; /* Offset to cell content of cell being deleted */ 007170 u8 *data; /* pPage->aData */ 007171 u8 *ptr; /* Used to move bytes around within data[] */ 007172 int rc; /* The return code */ 007173 int hdr; /* Beginning of the header. 0 most pages. 100 page 1 */ 007174 007175 if( *pRC ) return; 007176 assert( idx>=0 ); 007177 assert( idx<pPage->nCell ); 007178 assert( CORRUPT_DB || sz==cellSize(pPage, idx) ); 007179 assert( sqlite3PagerIswriteable(pPage->pDbPage) ); 007180 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 007181 assert( pPage->nFree>=0 ); 007182 data = pPage->aData; 007183 ptr = &pPage->aCellIdx[2*idx]; 007184 assert( pPage->pBt->usableSize > (u32)(ptr-data) ); 007185 pc = get2byte(ptr); 007186 hdr = pPage->hdrOffset; 007187 testcase( pc==(u32)get2byte(&data[hdr+5]) ); 007188 testcase( pc+sz==pPage->pBt->usableSize ); 007189 if( pc+sz > pPage->pBt->usableSize ){ 007190 *pRC = SQLITE_CORRUPT_BKPT; 007191 return; 007192 } 007193 rc = freeSpace(pPage, pc, sz); 007194 if( rc ){ 007195 *pRC = rc; 007196 return; 007197 } 007198 pPage->nCell--; 007199 if( pPage->nCell==0 ){ 007200 memset(&data[hdr+1], 0, 4); 007201 data[hdr+7] = 0; 007202 put2byte(&data[hdr+5], pPage->pBt->usableSize); 007203 pPage->nFree = pPage->pBt->usableSize - pPage->hdrOffset 007204 - pPage->childPtrSize - 8; 007205 }else{ 007206 memmove(ptr, ptr+2, 2*(pPage->nCell - idx)); 007207 put2byte(&data[hdr+3], pPage->nCell); 007208 pPage->nFree += 2; 007209 } 007210 } 007211 007212 /* 007213 ** Insert a new cell on pPage at cell index "i". pCell points to the 007214 ** content of the cell. 007215 ** 007216 ** If the cell content will fit on the page, then put it there. If it 007217 ** will not fit, then make a copy of the cell content into pTemp if 007218 ** pTemp is not null. 
Regardless of pTemp, allocate a new entry 007219 ** in pPage->apOvfl[] and make it point to the cell content (either 007220 ** in pTemp or the original pCell) and also record its index. 007221 ** Allocating a new entry in pPage->aCell[] implies that 007222 ** pPage->nOverflow is incremented. 007223 ** 007224 ** The insertCellFast() routine below works exactly the same as 007225 ** insertCell() except that it lacks the pTemp and iChild parameters 007226 ** which are assumed zero. Other than that, the two routines are the 007227 ** same. 007228 ** 007229 ** Fixes or enhancements to this routine should be reflected in 007230 ** insertCellFast()! 007231 */ 007232 static int insertCell( 007233 MemPage *pPage, /* Page into which we are copying */ 007234 int i, /* New cell becomes the i-th cell of the page */ 007235 u8 *pCell, /* Content of the new cell */ 007236 int sz, /* Bytes of content in pCell */ 007237 u8 *pTemp, /* Temp storage space for pCell, if needed */ 007238 Pgno iChild /* If non-zero, replace first 4 bytes with this value */ 007239 ){ 007240 int idx = 0; /* Where to write new cell content in data[] */ 007241 int j; /* Loop counter */ 007242 u8 *data; /* The content of the whole page */ 007243 u8 *pIns; /* The point in pPage->aCellIdx[] where no cell inserted */ 007244 007245 assert( i>=0 && i<=pPage->nCell+pPage->nOverflow ); 007246 assert( MX_CELL(pPage->pBt)<=10921 ); 007247 assert( pPage->nCell<=MX_CELL(pPage->pBt) || CORRUPT_DB ); 007248 assert( pPage->nOverflow<=ArraySize(pPage->apOvfl) ); 007249 assert( ArraySize(pPage->apOvfl)==ArraySize(pPage->aiOvfl) ); 007250 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 007251 assert( sz==pPage->xCellSize(pPage, pCell) || CORRUPT_DB ); 007252 assert( pPage->nFree>=0 ); 007253 assert( iChild>0 ); 007254 if( pPage->nOverflow || sz+2>pPage->nFree ){ 007255 if( pTemp ){ 007256 memcpy(pTemp, pCell, sz); 007257 pCell = pTemp; 007258 } 007259 put4byte(pCell, iChild); 007260 j = pPage->nOverflow++; 007261 /* 
Comparison against ArraySize-1 since we hold back one extra slot 007262 ** as a contingency. In other words, never need more than 3 overflow 007263 ** slots but 4 are allocated, just to be safe. */ 007264 assert( j < ArraySize(pPage->apOvfl)-1 ); 007265 pPage->apOvfl[j] = pCell; 007266 pPage->aiOvfl[j] = (u16)i; 007267 007268 /* When multiple overflows occur, they are always sequential and in 007269 ** sorted order. This invariants arise because multiple overflows can 007270 ** only occur when inserting divider cells into the parent page during 007271 ** balancing, and the dividers are adjacent and sorted. 007272 */ 007273 assert( j==0 || pPage->aiOvfl[j-1]<(u16)i ); /* Overflows in sorted order */ 007274 assert( j==0 || i==pPage->aiOvfl[j-1]+1 ); /* Overflows are sequential */ 007275 }else{ 007276 int rc = sqlite3PagerWrite(pPage->pDbPage); 007277 if( NEVER(rc!=SQLITE_OK) ){ 007278 return rc; 007279 } 007280 assert( sqlite3PagerIswriteable(pPage->pDbPage) ); 007281 data = pPage->aData; 007282 assert( &data[pPage->cellOffset]==pPage->aCellIdx ); 007283 rc = allocateSpace(pPage, sz, &idx); 007284 if( rc ){ return rc; } 007285 /* The allocateSpace() routine guarantees the following properties 007286 ** if it returns successfully */ 007287 assert( idx >= 0 ); 007288 assert( idx >= pPage->cellOffset+2*pPage->nCell+2 || CORRUPT_DB ); 007289 assert( idx+sz <= (int)pPage->pBt->usableSize ); 007290 pPage->nFree -= (u16)(2 + sz); 007291 /* In a corrupt database where an entry in the cell index section of 007292 ** a btree page has a value of 3 or less, the pCell value might point 007293 ** as many as 4 bytes in front of the start of the aData buffer for 007294 ** the source page. 
Make sure this does not cause problems by not 007295 ** reading the first 4 bytes */ 007296 memcpy(&data[idx+4], pCell+4, sz-4); 007297 put4byte(&data[idx], iChild); 007298 pIns = pPage->aCellIdx + i*2; 007299 memmove(pIns+2, pIns, 2*(pPage->nCell - i)); 007300 put2byte(pIns, idx); 007301 pPage->nCell++; 007302 /* increment the cell count */ 007303 if( (++data[pPage->hdrOffset+4])==0 ) data[pPage->hdrOffset+3]++; 007304 assert( get2byte(&data[pPage->hdrOffset+3])==pPage->nCell || CORRUPT_DB ); 007305 #ifndef SQLITE_OMIT_AUTOVACUUM 007306 if( pPage->pBt->autoVacuum ){ 007307 int rc2 = SQLITE_OK; 007308 /* The cell may contain a pointer to an overflow page. If so, write 007309 ** the entry for the overflow page into the pointer map. 007310 */ 007311 ptrmapPutOvflPtr(pPage, pPage, pCell, &rc2); 007312 if( rc2 ) return rc2; 007313 } 007314 #endif 007315 } 007316 return SQLITE_OK; 007317 } 007318 007319 /* 007320 ** This variant of insertCell() assumes that the pTemp and iChild 007321 ** parameters are both zero. Use this variant in sqlite3BtreeInsert() 007322 ** for performance improvement, and also so that this variant is only 007323 ** called from that one place, and is thus inlined, and thus runs must 007324 ** faster. 007325 ** 007326 ** Fixes or enhancements to this routine should be reflected into 007327 ** the insertCell() routine. 
007328 */ 007329 static int insertCellFast( 007330 MemPage *pPage, /* Page into which we are copying */ 007331 int i, /* New cell becomes the i-th cell of the page */ 007332 u8 *pCell, /* Content of the new cell */ 007333 int sz /* Bytes of content in pCell */ 007334 ){ 007335 int idx = 0; /* Where to write new cell content in data[] */ 007336 int j; /* Loop counter */ 007337 u8 *data; /* The content of the whole page */ 007338 u8 *pIns; /* The point in pPage->aCellIdx[] where no cell inserted */ 007339 007340 assert( i>=0 && i<=pPage->nCell+pPage->nOverflow ); 007341 assert( MX_CELL(pPage->pBt)<=10921 ); 007342 assert( pPage->nCell<=MX_CELL(pPage->pBt) || CORRUPT_DB ); 007343 assert( pPage->nOverflow<=ArraySize(pPage->apOvfl) ); 007344 assert( ArraySize(pPage->apOvfl)==ArraySize(pPage->aiOvfl) ); 007345 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 007346 assert( sz==pPage->xCellSize(pPage, pCell) || CORRUPT_DB ); 007347 assert( pPage->nFree>=0 ); 007348 assert( pPage->nOverflow==0 ); 007349 if( sz+2>pPage->nFree ){ 007350 j = pPage->nOverflow++; 007351 /* Comparison against ArraySize-1 since we hold back one extra slot 007352 ** as a contingency. In other words, never need more than 3 overflow 007353 ** slots but 4 are allocated, just to be safe. */ 007354 assert( j < ArraySize(pPage->apOvfl)-1 ); 007355 pPage->apOvfl[j] = pCell; 007356 pPage->aiOvfl[j] = (u16)i; 007357 007358 /* When multiple overflows occur, they are always sequential and in 007359 ** sorted order. This invariants arise because multiple overflows can 007360 ** only occur when inserting divider cells into the parent page during 007361 ** balancing, and the dividers are adjacent and sorted. 
007362 */ 007363 assert( j==0 || pPage->aiOvfl[j-1]<(u16)i ); /* Overflows in sorted order */ 007364 assert( j==0 || i==pPage->aiOvfl[j-1]+1 ); /* Overflows are sequential */ 007365 }else{ 007366 int rc = sqlite3PagerWrite(pPage->pDbPage); 007367 if( rc!=SQLITE_OK ){ 007368 return rc; 007369 } 007370 assert( sqlite3PagerIswriteable(pPage->pDbPage) ); 007371 data = pPage->aData; 007372 assert( &data[pPage->cellOffset]==pPage->aCellIdx ); 007373 rc = allocateSpace(pPage, sz, &idx); 007374 if( rc ){ return rc; } 007375 /* The allocateSpace() routine guarantees the following properties 007376 ** if it returns successfully */ 007377 assert( idx >= 0 ); 007378 assert( idx >= pPage->cellOffset+2*pPage->nCell+2 || CORRUPT_DB ); 007379 assert( idx+sz <= (int)pPage->pBt->usableSize ); 007380 pPage->nFree -= (u16)(2 + sz); 007381 memcpy(&data[idx], pCell, sz); 007382 pIns = pPage->aCellIdx + i*2; 007383 memmove(pIns+2, pIns, 2*(pPage->nCell - i)); 007384 put2byte(pIns, idx); 007385 pPage->nCell++; 007386 /* increment the cell count */ 007387 if( (++data[pPage->hdrOffset+4])==0 ) data[pPage->hdrOffset+3]++; 007388 assert( get2byte(&data[pPage->hdrOffset+3])==pPage->nCell || CORRUPT_DB ); 007389 #ifndef SQLITE_OMIT_AUTOVACUUM 007390 if( pPage->pBt->autoVacuum ){ 007391 int rc2 = SQLITE_OK; 007392 /* The cell may contain a pointer to an overflow page. If so, write 007393 ** the entry for the overflow page into the pointer map. 007394 */ 007395 ptrmapPutOvflPtr(pPage, pPage, pCell, &rc2); 007396 if( rc2 ) return rc2; 007397 } 007398 #endif 007399 } 007400 return SQLITE_OK; 007401 } 007402 007403 /* 007404 ** The following parameters determine how many adjacent pages get involved 007405 ** in a balancing operation. NN is the number of neighbors on either side 007406 ** of the page that participate in the balancing operation. NB is the 007407 ** total number of pages that participate, including the target page and 007408 ** NN neighbors on either side. 
007409 ** 007410 ** The minimum value of NN is 1 (of course). Increasing NN above 1 007411 ** (to 2 or 3) gives a modest improvement in SELECT and DELETE performance 007412 ** in exchange for a larger degradation in INSERT and UPDATE performance. 007413 ** The value of NN appears to give the best results overall. 007414 ** 007415 ** (Later:) The description above makes it seem as if these values are 007416 ** tunable - as if you could change them and recompile and it would all work. 007417 ** But that is unlikely. NB has been 3 since the inception of SQLite and 007418 ** we have never tested any other value. 007419 */ 007420 #define NN 1 /* Number of neighbors on either side of pPage */ 007421 #define NB 3 /* (NN*2+1): Total pages involved in the balance */ 007422 007423 /* 007424 ** A CellArray object contains a cache of pointers and sizes for a 007425 ** consecutive sequence of cells that might be held on multiple pages. 007426 ** 007427 ** The cells in this array are the divider cell or cells from the pParent 007428 ** page plus up to three child pages. There are a total of nCell cells. 007429 ** 007430 ** pRef is a pointer to one of the pages that contributes cells. This is 007431 ** used to access information such as MemPage.intKey and MemPage.pBt->pageSize 007432 ** which should be common to all pages that contribute cells to this array. 007433 ** 007434 ** apCell[] and szCell[] hold, respectively, pointers to the start of each 007435 ** cell and the size of each cell. Some of the apCell[] pointers might refer 007436 ** to overflow cells. In other words, some apCel[] pointers might not point 007437 ** to content area of the pages. 007438 ** 007439 ** A szCell[] of zero means the size of that cell has not yet been computed. 
007440 ** 007441 ** The cells come from as many as four different pages: 007442 ** 007443 ** ----------- 007444 ** | Parent | 007445 ** ----------- 007446 ** / | \ 007447 ** / | \ 007448 ** --------- --------- --------- 007449 ** |Child-1| |Child-2| |Child-3| 007450 ** --------- --------- --------- 007451 ** 007452 ** The order of cells is in the array is for an index btree is: 007453 ** 007454 ** 1. All cells from Child-1 in order 007455 ** 2. The first divider cell from Parent 007456 ** 3. All cells from Child-2 in order 007457 ** 4. The second divider cell from Parent 007458 ** 5. All cells from Child-3 in order 007459 ** 007460 ** For a table-btree (with rowids) the items 2 and 4 are empty because 007461 ** content exists only in leaves and there are no divider cells. 007462 ** 007463 ** For an index btree, the apEnd[] array holds pointer to the end of page 007464 ** for Child-1, the Parent, Child-2, the Parent (again), and Child-3, 007465 ** respectively. The ixNx[] array holds the number of cells contained in 007466 ** each of these 5 stages, and all stages to the left. Hence: 007467 ** 007468 ** ixNx[0] = Number of cells in Child-1. 007469 ** ixNx[1] = Number of cells in Child-1 plus 1 for first divider. 007470 ** ixNx[2] = Number of cells in Child-1 and Child-2 + 1 for 1st divider. 007471 ** ixNx[3] = Number of cells in Child-1 and Child-2 + both divider cells 007472 ** ixNx[4] = Total number of cells. 007473 ** 007474 ** For a table-btree, the concept is similar, except only apEnd[0]..apEnd[2] 007475 ** are used and they point to the leaf pages only, and the ixNx value are: 007476 ** 007477 ** ixNx[0] = Number of cells in Child-1. 007478 ** ixNx[1] = Number of cells in Child-1 and Child-2. 007479 ** ixNx[2] = Total number of cells. 007480 ** 007481 ** Sometimes when deleting, a child page can have zero cells. In those 007482 ** cases, ixNx[] entries with higher indexes, and the corresponding apEnd[] 007483 ** entries, shift down. 
The end result is that each ixNx[] entry should 007484 ** be larger than the previous 007485 */ 007486 typedef struct CellArray CellArray; 007487 struct CellArray { 007488 int nCell; /* Number of cells in apCell[] */ 007489 MemPage *pRef; /* Reference page */ 007490 u8 **apCell; /* All cells begin balanced */ 007491 u16 *szCell; /* Local size of all cells in apCell[] */ 007492 u8 *apEnd[NB*2]; /* MemPage.aDataEnd values */ 007493 int ixNx[NB*2]; /* Index of at which we move to the next apEnd[] */ 007494 }; 007495 007496 /* 007497 ** Make sure the cell sizes at idx, idx+1, ..., idx+N-1 have been 007498 ** computed. 007499 */ 007500 static void populateCellCache(CellArray *p, int idx, int N){ 007501 MemPage *pRef = p->pRef; 007502 u16 *szCell = p->szCell; 007503 assert( idx>=0 && idx+N<=p->nCell ); 007504 while( N>0 ){ 007505 assert( p->apCell[idx]!=0 ); 007506 if( szCell[idx]==0 ){ 007507 szCell[idx] = pRef->xCellSize(pRef, p->apCell[idx]); 007508 }else{ 007509 assert( CORRUPT_DB || 007510 szCell[idx]==pRef->xCellSize(pRef, p->apCell[idx]) ); 007511 } 007512 idx++; 007513 N--; 007514 } 007515 } 007516 007517 /* 007518 ** Return the size of the Nth element of the cell array 007519 */ 007520 static SQLITE_NOINLINE u16 computeCellSize(CellArray *p, int N){ 007521 assert( N>=0 && N<p->nCell ); 007522 assert( p->szCell[N]==0 ); 007523 p->szCell[N] = p->pRef->xCellSize(p->pRef, p->apCell[N]); 007524 return p->szCell[N]; 007525 } 007526 static u16 cachedCellSize(CellArray *p, int N){ 007527 assert( N>=0 && N<p->nCell ); 007528 if( p->szCell[N] ) return p->szCell[N]; 007529 return computeCellSize(p, N); 007530 } 007531 007532 /* 007533 ** Array apCell[] contains pointers to nCell b-tree page cells. The 007534 ** szCell[] array contains the size in bytes of each cell. This function 007535 ** replaces the current contents of page pPg with the contents of the cell 007536 ** array. 007537 ** 007538 ** Some of the cells in apCell[] may currently be stored in pPg. 
This 007539 ** function works around problems caused by this by making a copy of any 007540 ** such cells before overwriting the page data. 007541 ** 007542 ** The MemPage.nFree field is invalidated by this function. It is the 007543 ** responsibility of the caller to set it correctly. 007544 */ 007545 static int rebuildPage( 007546 CellArray *pCArray, /* Content to be added to page pPg */ 007547 int iFirst, /* First cell in pCArray to use */ 007548 int nCell, /* Final number of cells on page */ 007549 MemPage *pPg /* The page to be reconstructed */ 007550 ){ 007551 const int hdr = pPg->hdrOffset; /* Offset of header on pPg */ 007552 u8 * const aData = pPg->aData; /* Pointer to data for pPg */ 007553 const int usableSize = pPg->pBt->usableSize; 007554 u8 * const pEnd = &aData[usableSize]; 007555 int i = iFirst; /* Which cell to copy from pCArray*/ 007556 u32 j; /* Start of cell content area */ 007557 int iEnd = i+nCell; /* Loop terminator */ 007558 u8 *pCellptr = pPg->aCellIdx; 007559 u8 *pTmp = sqlite3PagerTempSpace(pPg->pBt->pPager); 007560 u8 *pData; 007561 int k; /* Current slot in pCArray->apEnd[] */ 007562 u8 *pSrcEnd; /* Current pCArray->apEnd[k] value */ 007563 007564 assert( nCell>0 ); 007565 assert( i<iEnd ); 007566 j = get2byte(&aData[hdr+5]); 007567 if( j>(u32)usableSize ){ j = 0; } 007568 memcpy(&pTmp[j], &aData[j], usableSize - j); 007569 007570 for(k=0; ALWAYS(k<NB*2) && pCArray->ixNx[k]<=i; k++){} 007571 pSrcEnd = pCArray->apEnd[k]; 007572 007573 pData = pEnd; 007574 while( 1/*exit by break*/ ){ 007575 u8 *pCell = pCArray->apCell[i]; 007576 u16 sz = pCArray->szCell[i]; 007577 assert( sz>0 ); 007578 if( SQLITE_WITHIN(pCell,aData+j,pEnd) ){ 007579 if( ((uptr)(pCell+sz))>(uptr)pEnd ) return SQLITE_CORRUPT_BKPT; 007580 pCell = &pTmp[pCell - aData]; 007581 }else if( (uptr)(pCell+sz)>(uptr)pSrcEnd 007582 && (uptr)(pCell)<(uptr)pSrcEnd 007583 ){ 007584 return SQLITE_CORRUPT_BKPT; 007585 } 007586 007587 pData -= sz; 007588 put2byte(pCellptr, (pData - 
aData)); 007589 pCellptr += 2; 007590 if( pData < pCellptr ) return SQLITE_CORRUPT_BKPT; 007591 memmove(pData, pCell, sz); 007592 assert( sz==pPg->xCellSize(pPg, pCell) || CORRUPT_DB ); 007593 i++; 007594 if( i>=iEnd ) break; 007595 if( pCArray->ixNx[k]<=i ){ 007596 k++; 007597 pSrcEnd = pCArray->apEnd[k]; 007598 } 007599 } 007600 007601 /* The pPg->nFree field is now set incorrectly. The caller will fix it. */ 007602 pPg->nCell = nCell; 007603 pPg->nOverflow = 0; 007604 007605 put2byte(&aData[hdr+1], 0); 007606 put2byte(&aData[hdr+3], pPg->nCell); 007607 put2byte(&aData[hdr+5], pData - aData); 007608 aData[hdr+7] = 0x00; 007609 return SQLITE_OK; 007610 } 007611 007612 /* 007613 ** The pCArray objects contains pointers to b-tree cells and the cell sizes. 007614 ** This function attempts to add the cells stored in the array to page pPg. 007615 ** If it cannot (because the page needs to be defragmented before the cells 007616 ** will fit), non-zero is returned. Otherwise, if the cells are added 007617 ** successfully, zero is returned. 007618 ** 007619 ** Argument pCellptr points to the first entry in the cell-pointer array 007620 ** (part of page pPg) to populate. After cell apCell[0] is written to the 007621 ** page body, a 16-bit offset is written to pCellptr. And so on, for each 007622 ** cell in the array. It is the responsibility of the caller to ensure 007623 ** that it is safe to overwrite this part of the cell-pointer array. 007624 ** 007625 ** When this function is called, *ppData points to the start of the 007626 ** content area on page pPg. If the size of the content area is extended, 007627 ** *ppData is updated to point to the new start of the content area 007628 ** before returning. 007629 ** 007630 ** Finally, argument pBegin points to the byte immediately following the 007631 ** end of the space required by this page for the cell-pointer area (for 007632 ** all cells - not just those inserted by the current call). 
If the content 007633 ** area must be extended to before this point in order to accommodate all 007634 ** cells in apCell[], then the cells do not fit and non-zero is returned. 007635 */ 007636 static int pageInsertArray( 007637 MemPage *pPg, /* Page to add cells to */ 007638 u8 *pBegin, /* End of cell-pointer array */ 007639 u8 **ppData, /* IN/OUT: Page content-area pointer */ 007640 u8 *pCellptr, /* Pointer to cell-pointer area */ 007641 int iFirst, /* Index of first cell to add */ 007642 int nCell, /* Number of cells to add to pPg */ 007643 CellArray *pCArray /* Array of cells */ 007644 ){ 007645 int i = iFirst; /* Loop counter - cell index to insert */ 007646 u8 *aData = pPg->aData; /* Complete page */ 007647 u8 *pData = *ppData; /* Content area. A subset of aData[] */ 007648 int iEnd = iFirst + nCell; /* End of loop. One past last cell to ins */ 007649 int k; /* Current slot in pCArray->apEnd[] */ 007650 u8 *pEnd; /* Maximum extent of cell data */ 007651 assert( CORRUPT_DB || pPg->hdrOffset==0 ); /* Never called on page 1 */ 007652 if( iEnd<=iFirst ) return 0; 007653 for(k=0; ALWAYS(k<NB*2) && pCArray->ixNx[k]<=i ; k++){} 007654 pEnd = pCArray->apEnd[k]; 007655 while( 1 /*Exit by break*/ ){ 007656 int sz, rc; 007657 u8 *pSlot; 007658 assert( pCArray->szCell[i]!=0 ); 007659 sz = pCArray->szCell[i]; 007660 if( (aData[1]==0 && aData[2]==0) || (pSlot = pageFindSlot(pPg,sz,&rc))==0 ){ 007661 if( (pData - pBegin)<sz ) return 1; 007662 pData -= sz; 007663 pSlot = pData; 007664 } 007665 /* pSlot and pCArray->apCell[i] will never overlap on a well-formed 007666 ** database. But they might for a corrupt database. 
Hence use memmove() 007667 ** since memcpy() sends SIGABORT with overlapping buffers on OpenBSD */ 007668 assert( (pSlot+sz)<=pCArray->apCell[i] 007669 || pSlot>=(pCArray->apCell[i]+sz) 007670 || CORRUPT_DB ); 007671 if( (uptr)(pCArray->apCell[i]+sz)>(uptr)pEnd 007672 && (uptr)(pCArray->apCell[i])<(uptr)pEnd 007673 ){ 007674 assert( CORRUPT_DB ); 007675 (void)SQLITE_CORRUPT_BKPT; 007676 return 1; 007677 } 007678 memmove(pSlot, pCArray->apCell[i], sz); 007679 put2byte(pCellptr, (pSlot - aData)); 007680 pCellptr += 2; 007681 i++; 007682 if( i>=iEnd ) break; 007683 if( pCArray->ixNx[k]<=i ){ 007684 k++; 007685 pEnd = pCArray->apEnd[k]; 007686 } 007687 } 007688 *ppData = pData; 007689 return 0; 007690 } 007691 007692 /* 007693 ** The pCArray object contains pointers to b-tree cells and their sizes. 007694 ** 007695 ** This function adds the space associated with each cell in the array 007696 ** that is currently stored within the body of pPg to the pPg free-list. 007697 ** The cell-pointers and other fields of the page are not updated. 007698 ** 007699 ** This function returns the total number of cells added to the free-list. 007700 */ 007701 static int pageFreeArray( 007702 MemPage *pPg, /* Page to edit */ 007703 int iFirst, /* First cell to delete */ 007704 int nCell, /* Cells to delete */ 007705 CellArray *pCArray /* Array of cells */ 007706 ){ 007707 u8 * const aData = pPg->aData; 007708 u8 * const pEnd = &aData[pPg->pBt->usableSize]; 007709 u8 * const pStart = &aData[pPg->hdrOffset + 8 + pPg->childPtrSize]; 007710 int nRet = 0; 007711 int i, j; 007712 int iEnd = iFirst + nCell; 007713 int nFree = 0; 007714 int aOfst[10]; 007715 int aAfter[10]; 007716 007717 for(i=iFirst; i<iEnd; i++){ 007718 u8 *pCell = pCArray->apCell[i]; 007719 if( SQLITE_WITHIN(pCell, pStart, pEnd) ){ 007720 int sz; 007721 int iAfter; 007722 int iOfst; 007723 /* No need to use cachedCellSize() here. 
The sizes of all cells that 007724 ** are to be freed have already been computing while deciding which 007725 ** cells need freeing */ 007726 sz = pCArray->szCell[i]; assert( sz>0 ); 007727 iOfst = (u16)(pCell - aData); 007728 iAfter = iOfst+sz; 007729 for(j=0; j<nFree; j++){ 007730 if( aOfst[j]==iAfter ){ 007731 aOfst[j] = iOfst; 007732 break; 007733 }else if( aAfter[j]==iOfst ){ 007734 aAfter[j] = iAfter; 007735 break; 007736 } 007737 } 007738 if( j>=nFree ){ 007739 if( nFree>=(int)(sizeof(aOfst)/sizeof(aOfst[0])) ){ 007740 for(j=0; j<nFree; j++){ 007741 freeSpace(pPg, aOfst[j], aAfter[j]-aOfst[j]); 007742 } 007743 nFree = 0; 007744 } 007745 aOfst[nFree] = iOfst; 007746 aAfter[nFree] = iAfter; 007747 if( &aData[iAfter]>pEnd ) return 0; 007748 nFree++; 007749 } 007750 nRet++; 007751 } 007752 } 007753 for(j=0; j<nFree; j++){ 007754 freeSpace(pPg, aOfst[j], aAfter[j]-aOfst[j]); 007755 } 007756 return nRet; 007757 } 007758 007759 /* 007760 ** pCArray contains pointers to and sizes of all cells in the page being 007761 ** balanced. The current page, pPg, has pPg->nCell cells starting with 007762 ** pCArray->apCell[iOld]. After balancing, this page should hold nNew cells 007763 ** starting at apCell[iNew]. 007764 ** 007765 ** This routine makes the necessary adjustments to pPg so that it contains 007766 ** the correct cells after being balanced. 007767 ** 007768 ** The pPg->nFree field is invalid when this function returns. It is the 007769 ** responsibility of the caller to set it correctly. 
007770 */ 007771 static int editPage( 007772 MemPage *pPg, /* Edit this page */ 007773 int iOld, /* Index of first cell currently on page */ 007774 int iNew, /* Index of new first cell on page */ 007775 int nNew, /* Final number of cells on page */ 007776 CellArray *pCArray /* Array of cells and sizes */ 007777 ){ 007778 u8 * const aData = pPg->aData; 007779 const int hdr = pPg->hdrOffset; 007780 u8 *pBegin = &pPg->aCellIdx[nNew * 2]; 007781 int nCell = pPg->nCell; /* Cells stored on pPg */ 007782 u8 *pData; 007783 u8 *pCellptr; 007784 int i; 007785 int iOldEnd = iOld + pPg->nCell + pPg->nOverflow; 007786 int iNewEnd = iNew + nNew; 007787 007788 #ifdef SQLITE_DEBUG 007789 u8 *pTmp = sqlite3PagerTempSpace(pPg->pBt->pPager); 007790 memcpy(pTmp, aData, pPg->pBt->usableSize); 007791 #endif 007792 007793 /* Remove cells from the start and end of the page */ 007794 assert( nCell>=0 ); 007795 if( iOld<iNew ){ 007796 int nShift = pageFreeArray(pPg, iOld, iNew-iOld, pCArray); 007797 if( NEVER(nShift>nCell) ) return SQLITE_CORRUPT_BKPT; 007798 memmove(pPg->aCellIdx, &pPg->aCellIdx[nShift*2], nCell*2); 007799 nCell -= nShift; 007800 } 007801 if( iNewEnd < iOldEnd ){ 007802 int nTail = pageFreeArray(pPg, iNewEnd, iOldEnd - iNewEnd, pCArray); 007803 assert( nCell>=nTail ); 007804 nCell -= nTail; 007805 } 007806 007807 pData = &aData[get2byte(&aData[hdr+5])]; 007808 if( pData<pBegin ) goto editpage_fail; 007809 if( NEVER(pData>pPg->aDataEnd) ) goto editpage_fail; 007810 007811 /* Add cells to the start of the page */ 007812 if( iNew<iOld ){ 007813 int nAdd = MIN(nNew,iOld-iNew); 007814 assert( (iOld-iNew)<nNew || nCell==0 || CORRUPT_DB ); 007815 assert( nAdd>=0 ); 007816 pCellptr = pPg->aCellIdx; 007817 memmove(&pCellptr[nAdd*2], pCellptr, nCell*2); 007818 if( pageInsertArray( 007819 pPg, pBegin, &pData, pCellptr, 007820 iNew, nAdd, pCArray 007821 ) ) goto editpage_fail; 007822 nCell += nAdd; 007823 } 007824 007825 /* Add any overflow cells */ 007826 for(i=0; i<pPg->nOverflow; 
i++){ 007827 int iCell = (iOld + pPg->aiOvfl[i]) - iNew; 007828 if( iCell>=0 && iCell<nNew ){ 007829 pCellptr = &pPg->aCellIdx[iCell * 2]; 007830 if( nCell>iCell ){ 007831 memmove(&pCellptr[2], pCellptr, (nCell - iCell) * 2); 007832 } 007833 nCell++; 007834 cachedCellSize(pCArray, iCell+iNew); 007835 if( pageInsertArray( 007836 pPg, pBegin, &pData, pCellptr, 007837 iCell+iNew, 1, pCArray 007838 ) ) goto editpage_fail; 007839 } 007840 } 007841 007842 /* Append cells to the end of the page */ 007843 assert( nCell>=0 ); 007844 pCellptr = &pPg->aCellIdx[nCell*2]; 007845 if( pageInsertArray( 007846 pPg, pBegin, &pData, pCellptr, 007847 iNew+nCell, nNew-nCell, pCArray 007848 ) ) goto editpage_fail; 007849 007850 pPg->nCell = nNew; 007851 pPg->nOverflow = 0; 007852 007853 put2byte(&aData[hdr+3], pPg->nCell); 007854 put2byte(&aData[hdr+5], pData - aData); 007855 007856 #ifdef SQLITE_DEBUG 007857 for(i=0; i<nNew && !CORRUPT_DB; i++){ 007858 u8 *pCell = pCArray->apCell[i+iNew]; 007859 int iOff = get2byteAligned(&pPg->aCellIdx[i*2]); 007860 if( SQLITE_WITHIN(pCell, aData, &aData[pPg->pBt->usableSize]) ){ 007861 pCell = &pTmp[pCell - aData]; 007862 } 007863 assert( 0==memcmp(pCell, &aData[iOff], 007864 pCArray->pRef->xCellSize(pCArray->pRef, pCArray->apCell[i+iNew])) ); 007865 } 007866 #endif 007867 007868 return SQLITE_OK; 007869 editpage_fail: 007870 /* Unable to edit this page. Rebuild it from scratch instead. */ 007871 if( nNew<1 ) return SQLITE_CORRUPT_BKPT; 007872 populateCellCache(pCArray, iNew, nNew); 007873 return rebuildPage(pCArray, iNew, nNew, pPg); 007874 } 007875 007876 007877 #ifndef SQLITE_OMIT_QUICKBALANCE 007878 /* 007879 ** This version of balance() handles the common special case where 007880 ** a new entry is being inserted on the extreme right-end of the 007881 ** tree, in other words, when the new entry will become the largest 007882 ** entry in the tree. 
007883 ** 007884 ** Instead of trying to balance the 3 right-most leaf pages, just add 007885 ** a new page to the right-hand side and put the one new entry in 007886 ** that page. This leaves the right side of the tree somewhat 007887 ** unbalanced. But odds are that we will be inserting new entries 007888 ** at the end soon afterwards so the nearly empty page will quickly 007889 ** fill up. On average. 007890 ** 007891 ** pPage is the leaf page which is the right-most page in the tree. 007892 ** pParent is its parent. pPage must have a single overflow entry 007893 ** which is also the right-most entry on the page. 007894 ** 007895 ** The pSpace buffer is used to store a temporary copy of the divider 007896 ** cell that will be inserted into pParent. Such a cell consists of a 4 007897 ** byte page number followed by a variable length integer. In other 007898 ** words, at most 13 bytes. Hence the pSpace buffer must be at 007899 ** least 13 bytes in size. 007900 */ 007901 static int balance_quick(MemPage *pParent, MemPage *pPage, u8 *pSpace){ 007902 BtShared *const pBt = pPage->pBt; /* B-Tree Database */ 007903 MemPage *pNew; /* Newly allocated page */ 007904 int rc; /* Return Code */ 007905 Pgno pgnoNew; /* Page number of pNew */ 007906 007907 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 007908 assert( sqlite3PagerIswriteable(pParent->pDbPage) ); 007909 assert( pPage->nOverflow==1 ); 007910 007911 if( pPage->nCell==0 ) return SQLITE_CORRUPT_BKPT; /* dbfuzz001.test */ 007912 assert( pPage->nFree>=0 ); 007913 assert( pParent->nFree>=0 ); 007914 007915 /* Allocate a new page. This page will become the right-sibling of 007916 ** pPage. Make the parent page writable, so that the new divider cell 007917 ** may be inserted. If both these operations are successful, proceed. 
007918 */ 007919 rc = allocateBtreePage(pBt, &pNew, &pgnoNew, 0, 0); 007920 007921 if( rc==SQLITE_OK ){ 007922 007923 u8 *pOut = &pSpace[4]; 007924 u8 *pCell = pPage->apOvfl[0]; 007925 u16 szCell = pPage->xCellSize(pPage, pCell); 007926 u8 *pStop; 007927 CellArray b; 007928 007929 assert( sqlite3PagerIswriteable(pNew->pDbPage) ); 007930 assert( CORRUPT_DB || pPage->aData[0]==(PTF_INTKEY|PTF_LEAFDATA|PTF_LEAF) ); 007931 zeroPage(pNew, PTF_INTKEY|PTF_LEAFDATA|PTF_LEAF); 007932 b.nCell = 1; 007933 b.pRef = pPage; 007934 b.apCell = &pCell; 007935 b.szCell = &szCell; 007936 b.apEnd[0] = pPage->aDataEnd; 007937 b.ixNx[0] = 2; 007938 rc = rebuildPage(&b, 0, 1, pNew); 007939 if( NEVER(rc) ){ 007940 releasePage(pNew); 007941 return rc; 007942 } 007943 pNew->nFree = pBt->usableSize - pNew->cellOffset - 2 - szCell; 007944 007945 /* If this is an auto-vacuum database, update the pointer map 007946 ** with entries for the new page, and any pointer from the 007947 ** cell on the page to an overflow page. If either of these 007948 ** operations fails, the return code is set, but the contents 007949 ** of the parent page are still manipulated by the code below. 007950 ** That is Ok, at this point the parent page is guaranteed to 007951 ** be marked as dirty. Returning an error code will cause a 007952 ** rollback, undoing any changes made to the parent page. 007953 */ 007954 if( ISAUTOVACUUM(pBt) ){ 007955 ptrmapPut(pBt, pgnoNew, PTRMAP_BTREE, pParent->pgno, &rc); 007956 if( szCell>pNew->minLocal ){ 007957 ptrmapPutOvflPtr(pNew, pNew, pCell, &rc); 007958 } 007959 } 007960 007961 /* Create a divider cell to insert into pParent. The divider cell 007962 ** consists of a 4-byte page number (the page number of pPage) and 007963 ** a variable length key value (which must be the same value as the 007964 ** largest key on pPage). 007965 ** 007966 ** To find the largest key value on pPage, first find the right-most 007967 ** cell on pPage. 
The first two fields of this cell are the 007968 ** record-length (a variable length integer at most 32-bits in size) 007969 ** and the key value (a variable length integer, may have any value). 007970 ** The first of the while(...) loops below skips over the record-length 007971 ** field. The second while(...) loop copies the key value from the 007972 ** cell on pPage into the pSpace buffer. 007973 */ 007974 pCell = findCell(pPage, pPage->nCell-1); 007975 pStop = &pCell[9]; 007976 while( (*(pCell++)&0x80) && pCell<pStop ); 007977 pStop = &pCell[9]; 007978 while( ((*(pOut++) = *(pCell++))&0x80) && pCell<pStop ); 007979 007980 /* Insert the new divider cell into pParent. */ 007981 if( rc==SQLITE_OK ){ 007982 rc = insertCell(pParent, pParent->nCell, pSpace, (int)(pOut-pSpace), 007983 0, pPage->pgno); 007984 } 007985 007986 /* Set the right-child pointer of pParent to point to the new page. */ 007987 put4byte(&pParent->aData[pParent->hdrOffset+8], pgnoNew); 007988 007989 /* Release the reference to the new page. */ 007990 releasePage(pNew); 007991 } 007992 007993 return rc; 007994 } 007995 #endif /* SQLITE_OMIT_QUICKBALANCE */ 007996 007997 #if 0 007998 /* 007999 ** This function does not contribute anything to the operation of SQLite. 008000 ** it is sometimes activated temporarily while debugging code responsible 008001 ** for setting pointer-map entries. 
008002 */ 008003 static int ptrmapCheckPages(MemPage **apPage, int nPage){ 008004 int i, j; 008005 for(i=0; i<nPage; i++){ 008006 Pgno n; 008007 u8 e; 008008 MemPage *pPage = apPage[i]; 008009 BtShared *pBt = pPage->pBt; 008010 assert( pPage->isInit ); 008011 008012 for(j=0; j<pPage->nCell; j++){ 008013 CellInfo info; 008014 u8 *z; 008015 008016 z = findCell(pPage, j); 008017 pPage->xParseCell(pPage, z, &info); 008018 if( info.nLocal<info.nPayload ){ 008019 Pgno ovfl = get4byte(&z[info.nSize-4]); 008020 ptrmapGet(pBt, ovfl, &e, &n); 008021 assert( n==pPage->pgno && e==PTRMAP_OVERFLOW1 ); 008022 } 008023 if( !pPage->leaf ){ 008024 Pgno child = get4byte(z); 008025 ptrmapGet(pBt, child, &e, &n); 008026 assert( n==pPage->pgno && e==PTRMAP_BTREE ); 008027 } 008028 } 008029 if( !pPage->leaf ){ 008030 Pgno child = get4byte(&pPage->aData[pPage->hdrOffset+8]); 008031 ptrmapGet(pBt, child, &e, &n); 008032 assert( n==pPage->pgno && e==PTRMAP_BTREE ); 008033 } 008034 } 008035 return 1; 008036 } 008037 #endif 008038 008039 /* 008040 ** This function is used to copy the contents of the b-tree node stored 008041 ** on page pFrom to page pTo. If page pFrom was not a leaf page, then 008042 ** the pointer-map entries for each child page are updated so that the 008043 ** parent page stored in the pointer map is page pTo. If pFrom contained 008044 ** any cells with overflow page pointers, then the corresponding pointer 008045 ** map entries are also updated so that the parent page is page pTo. 008046 ** 008047 ** If pFrom is currently carrying any overflow cells (entries in the 008048 ** MemPage.apOvfl[] array), they are not copied to pTo. 008049 ** 008050 ** Before returning, page pTo is reinitialized using btreeInitPage(). 008051 ** 008052 ** The performance of this function is not critical. It is only used by 008053 ** the balance_shallower() and balance_deeper() procedures, neither of 008054 ** which are called often under normal circumstances. 
008055 */ 008056 static void copyNodeContent(MemPage *pFrom, MemPage *pTo, int *pRC){ 008057 if( (*pRC)==SQLITE_OK ){ 008058 BtShared * const pBt = pFrom->pBt; 008059 u8 * const aFrom = pFrom->aData; 008060 u8 * const aTo = pTo->aData; 008061 int const iFromHdr = pFrom->hdrOffset; 008062 int const iToHdr = ((pTo->pgno==1) ? 100 : 0); 008063 int rc; 008064 int iData; 008065 008066 008067 assert( pFrom->isInit ); 008068 assert( pFrom->nFree>=iToHdr ); 008069 assert( get2byte(&aFrom[iFromHdr+5]) <= (int)pBt->usableSize ); 008070 008071 /* Copy the b-tree node content from page pFrom to page pTo. */ 008072 iData = get2byte(&aFrom[iFromHdr+5]); 008073 memcpy(&aTo[iData], &aFrom[iData], pBt->usableSize-iData); 008074 memcpy(&aTo[iToHdr], &aFrom[iFromHdr], pFrom->cellOffset + 2*pFrom->nCell); 008075 008076 /* Reinitialize page pTo so that the contents of the MemPage structure 008077 ** match the new data. The initialization of pTo can actually fail under 008078 ** fairly obscure circumstances, even though it is a copy of initialized 008079 ** page pFrom. 008080 */ 008081 pTo->isInit = 0; 008082 rc = btreeInitPage(pTo); 008083 if( rc==SQLITE_OK ) rc = btreeComputeFreeSpace(pTo); 008084 if( rc!=SQLITE_OK ){ 008085 *pRC = rc; 008086 return; 008087 } 008088 008089 /* If this is an auto-vacuum database, update the pointer-map entries 008090 ** for any b-tree or overflow pages that pTo now contains the pointers to. 008091 */ 008092 if( ISAUTOVACUUM(pBt) ){ 008093 *pRC = setChildPtrmaps(pTo); 008094 } 008095 } 008096 } 008097 008098 /* 008099 ** This routine redistributes cells on the iParentIdx'th child of pParent 008100 ** (hereafter "the page") and up to 2 siblings so that all pages have about the 008101 ** same amount of free space. Usually a single sibling on either side of the 008102 ** page are used in the balancing, though both siblings might come from one 008103 ** side if the page is the first or last child of its parent. 
If the page 008104 ** has fewer than 2 siblings (something which can only happen if the page 008105 ** is a root page or a child of a root page) then all available siblings 008106 ** participate in the balancing. 008107 ** 008108 ** The number of siblings of the page might be increased or decreased by 008109 ** one or two in an effort to keep pages nearly full but not over full. 008110 ** 008111 ** Note that when this routine is called, some of the cells on the page 008112 ** might not actually be stored in MemPage.aData[]. This can happen 008113 ** if the page is overfull. This routine ensures that all cells allocated 008114 ** to the page and its siblings fit into MemPage.aData[] before returning. 008115 ** 008116 ** In the course of balancing the page and its siblings, cells may be 008117 ** inserted into or removed from the parent page (pParent). Doing so 008118 ** may cause the parent page to become overfull or underfull. If this 008119 ** happens, it is the responsibility of the caller to invoke the correct 008120 ** balancing routine to fix this problem (see the balance() routine). 008121 ** 008122 ** If this routine fails for any reason, it might leave the database 008123 ** in a corrupted state. So if this routine fails, the database should 008124 ** be rolled back. 008125 ** 008126 ** The third argument to this function, aOvflSpace, is a pointer to a 008127 ** buffer big enough to hold one page. If while inserting cells into the parent 008128 ** page (pParent) the parent page becomes overfull, this buffer is 008129 ** used to store the parent's overflow cells. Because this function inserts 008130 ** a maximum of four divider cells into the parent page, and the maximum 008131 ** size of a cell stored within an internal node is always less than 1/4 008132 ** of the page-size, the aOvflSpace[] buffer is guaranteed to be large 008133 ** enough for all overflow cells. 
008134 ** 008135 ** If aOvflSpace is set to a null pointer, this function returns 008136 ** SQLITE_NOMEM. 008137 */ 008138 static int balance_nonroot( 008139 MemPage *pParent, /* Parent page of siblings being balanced */ 008140 int iParentIdx, /* Index of "the page" in pParent */ 008141 u8 *aOvflSpace, /* page-size bytes of space for parent ovfl */ 008142 int isRoot, /* True if pParent is a root-page */ 008143 int bBulk /* True if this call is part of a bulk load */ 008144 ){ 008145 BtShared *pBt; /* The whole database */ 008146 int nMaxCells = 0; /* Allocated size of apCell, szCell, aFrom. */ 008147 int nNew = 0; /* Number of pages in apNew[] */ 008148 int nOld; /* Number of pages in apOld[] */ 008149 int i, j, k; /* Loop counters */ 008150 int nxDiv; /* Next divider slot in pParent->aCell[] */ 008151 int rc = SQLITE_OK; /* The return code */ 008152 u16 leafCorrection; /* 4 if pPage is a leaf. 0 if not */ 008153 int leafData; /* True if pPage is a leaf of a LEAFDATA tree */ 008154 int usableSpace; /* Bytes in pPage beyond the header */ 008155 int pageFlags; /* Value of pPage->aData[0] */ 008156 int iSpace1 = 0; /* First unused byte of aSpace1[] */ 008157 int iOvflSpace = 0; /* First unused byte of aOvflSpace[] */ 008158 int szScratch; /* Size of scratch memory requested */ 008159 MemPage *apOld[NB]; /* pPage and up to two siblings */ 008160 MemPage *apNew[NB+2]; /* pPage and up to NB siblings after balancing */ 008161 u8 *pRight; /* Location in parent of right-sibling pointer */ 008162 u8 *apDiv[NB-1]; /* Divider cells in pParent */ 008163 int cntNew[NB+2]; /* Index in b.paCell[] of cell after i-th page */ 008164 int cntOld[NB+2]; /* Old index in b.apCell[] */ 008165 int szNew[NB+2]; /* Combined size of cells placed on i-th page */ 008166 u8 *aSpace1; /* Space for copies of dividers cells */ 008167 Pgno pgno; /* Temp var to store a page number in */ 008168 u8 abDone[NB+2]; /* True after i'th new page is populated */ 008169 Pgno aPgno[NB+2]; /* Page numbers of new 
pages before shuffling */ 008170 CellArray b; /* Parsed information on cells being balanced */ 008171 008172 memset(abDone, 0, sizeof(abDone)); 008173 memset(&b, 0, sizeof(b)); 008174 pBt = pParent->pBt; 008175 assert( sqlite3_mutex_held(pBt->mutex) ); 008176 assert( sqlite3PagerIswriteable(pParent->pDbPage) ); 008177 008178 /* At this point pParent may have at most one overflow cell. And if 008179 ** this overflow cell is present, it must be the cell with 008180 ** index iParentIdx. This scenario comes about when this function 008181 ** is called (indirectly) from sqlite3BtreeDelete(). 008182 */ 008183 assert( pParent->nOverflow==0 || pParent->nOverflow==1 ); 008184 assert( pParent->nOverflow==0 || pParent->aiOvfl[0]==iParentIdx ); 008185 008186 if( !aOvflSpace ){ 008187 return SQLITE_NOMEM_BKPT; 008188 } 008189 assert( pParent->nFree>=0 ); 008190 008191 /* Find the sibling pages to balance. Also locate the cells in pParent 008192 ** that divide the siblings. An attempt is made to find NN siblings on 008193 ** either side of pPage. More siblings are taken from one side, however, 008194 ** if there are fewer than NN siblings on the other side. If pParent 008195 ** has NB or fewer children then all children of pParent are taken. 008196 ** 008197 ** This loop also drops the divider cells from the parent page. This 008198 ** way, the remainder of the function does not have to deal with any 008199 ** overflow cells in the parent page, since if any existed they will 008200 ** have already been removed. 
008201 */ 008202 i = pParent->nOverflow + pParent->nCell; 008203 if( i<2 ){ 008204 nxDiv = 0; 008205 }else{ 008206 assert( bBulk==0 || bBulk==1 ); 008207 if( iParentIdx==0 ){ 008208 nxDiv = 0; 008209 }else if( iParentIdx==i ){ 008210 nxDiv = i-2+bBulk; 008211 }else{ 008212 nxDiv = iParentIdx-1; 008213 } 008214 i = 2-bBulk; 008215 } 008216 nOld = i+1; 008217 if( (i+nxDiv-pParent->nOverflow)==pParent->nCell ){ 008218 pRight = &pParent->aData[pParent->hdrOffset+8]; 008219 }else{ 008220 pRight = findCell(pParent, i+nxDiv-pParent->nOverflow); 008221 } 008222 pgno = get4byte(pRight); 008223 while( 1 ){ 008224 if( rc==SQLITE_OK ){ 008225 rc = getAndInitPage(pBt, pgno, &apOld[i], 0); 008226 } 008227 if( rc ){ 008228 memset(apOld, 0, (i+1)*sizeof(MemPage*)); 008229 goto balance_cleanup; 008230 } 008231 if( apOld[i]->nFree<0 ){ 008232 rc = btreeComputeFreeSpace(apOld[i]); 008233 if( rc ){ 008234 memset(apOld, 0, (i)*sizeof(MemPage*)); 008235 goto balance_cleanup; 008236 } 008237 } 008238 nMaxCells += apOld[i]->nCell + ArraySize(pParent->apOvfl); 008239 if( (i--)==0 ) break; 008240 008241 if( pParent->nOverflow && i+nxDiv==pParent->aiOvfl[0] ){ 008242 apDiv[i] = pParent->apOvfl[0]; 008243 pgno = get4byte(apDiv[i]); 008244 szNew[i] = pParent->xCellSize(pParent, apDiv[i]); 008245 pParent->nOverflow = 0; 008246 }else{ 008247 apDiv[i] = findCell(pParent, i+nxDiv-pParent->nOverflow); 008248 pgno = get4byte(apDiv[i]); 008249 szNew[i] = pParent->xCellSize(pParent, apDiv[i]); 008250 008251 /* Drop the cell from the parent page. apDiv[i] still points to 008252 ** the cell within the parent, even though it has been dropped. 008253 ** This is safe because dropping a cell only overwrites the first 008254 ** four bytes of it, and this function does not need the first 008255 ** four bytes of the divider cell. So the pointer is safe to use 008256 ** later on. 008257 ** 008258 ** But not if we are in secure-delete mode. 
In secure-delete mode, 008259 ** the dropCell() routine will overwrite the entire cell with zeroes. 008260 ** In this case, temporarily copy the cell into the aOvflSpace[] 008261 ** buffer. It will be copied out again as soon as the aSpace[] buffer 008262 ** is allocated. */ 008263 if( pBt->btsFlags & BTS_FAST_SECURE ){ 008264 int iOff; 008265 008266 /* If the following if() condition is not true, the db is corrupted. 008267 ** The call to dropCell() below will detect this. */ 008268 iOff = SQLITE_PTR_TO_INT(apDiv[i]) - SQLITE_PTR_TO_INT(pParent->aData); 008269 if( (iOff+szNew[i])<=(int)pBt->usableSize ){ 008270 memcpy(&aOvflSpace[iOff], apDiv[i], szNew[i]); 008271 apDiv[i] = &aOvflSpace[apDiv[i]-pParent->aData]; 008272 } 008273 } 008274 dropCell(pParent, i+nxDiv-pParent->nOverflow, szNew[i], &rc); 008275 } 008276 } 008277 008278 /* Make nMaxCells a multiple of 4 in order to preserve 8-byte 008279 ** alignment */ 008280 nMaxCells = (nMaxCells + 3)&~3; 008281 008282 /* 008283 ** Allocate space for memory structures 008284 */ 008285 szScratch = 008286 nMaxCells*sizeof(u8*) /* b.apCell */ 008287 + nMaxCells*sizeof(u16) /* b.szCell */ 008288 + pBt->pageSize; /* aSpace1 */ 008289 008290 assert( szScratch<=7*(int)pBt->pageSize ); 008291 b.apCell = sqlite3StackAllocRaw(0, szScratch ); 008292 if( b.apCell==0 ){ 008293 rc = SQLITE_NOMEM_BKPT; 008294 goto balance_cleanup; 008295 } 008296 b.szCell = (u16*)&b.apCell[nMaxCells]; 008297 aSpace1 = (u8*)&b.szCell[nMaxCells]; 008298 assert( EIGHT_BYTE_ALIGNMENT(aSpace1) ); 008299 008300 /* 008301 ** Load pointers to all cells on sibling pages and the divider cells 008302 ** into the local b.apCell[] array. Make copies of the divider cells 008303 ** into space obtained from aSpace1[]. The divider cells have already 008304 ** been removed from pParent. 
008305 ** 008306 ** If the siblings are on leaf pages, then the child pointers of the 008307 ** divider cells are stripped from the cells before they are copied 008308 ** into aSpace1[]. In this way, all cells in b.apCell[] are without 008309 ** child pointers. If siblings are not leaves, then all cell in 008310 ** b.apCell[] include child pointers. Either way, all cells in b.apCell[] 008311 ** are alike. 008312 ** 008313 ** leafCorrection: 4 if pPage is a leaf. 0 if pPage is not a leaf. 008314 ** leafData: 1 if pPage holds key+data and pParent holds only keys. 008315 */ 008316 b.pRef = apOld[0]; 008317 leafCorrection = b.pRef->leaf*4; 008318 leafData = b.pRef->intKeyLeaf; 008319 for(i=0; i<nOld; i++){ 008320 MemPage *pOld = apOld[i]; 008321 int limit = pOld->nCell; 008322 u8 *aData = pOld->aData; 008323 u16 maskPage = pOld->maskPage; 008324 u8 *piCell = aData + pOld->cellOffset; 008325 u8 *piEnd; 008326 VVA_ONLY( int nCellAtStart = b.nCell; ) 008327 008328 /* Verify that all sibling pages are of the same "type" (table-leaf, 008329 ** table-interior, index-leaf, or index-interior). 008330 */ 008331 if( pOld->aData[0]!=apOld[0]->aData[0] ){ 008332 rc = SQLITE_CORRUPT_PAGE(pOld); 008333 goto balance_cleanup; 008334 } 008335 008336 /* Load b.apCell[] with pointers to all cells in pOld. If pOld 008337 ** contains overflow cells, include them in the b.apCell[] array 008338 ** in the correct spot. 008339 ** 008340 ** Note that when there are multiple overflow cells, it is always the 008341 ** case that they are sequential and adjacent. This invariant arises 008342 ** because multiple overflows can only occurs when inserting divider 008343 ** cells into a parent on a prior balance, and divider cells are always 008344 ** adjacent and are inserted in order. There is an assert() tagged 008345 ** with "NOTE 1" in the overflow cell insertion loop to prove this 008346 ** invariant. 008347 ** 008348 ** This must be done in advance. 
Once the balance starts, the cell 008349 ** offset section of the btree page will be overwritten and we will no 008350 ** long be able to find the cells if a pointer to each cell is not saved 008351 ** first. 008352 */ 008353 memset(&b.szCell[b.nCell], 0, sizeof(b.szCell[0])*(limit+pOld->nOverflow)); 008354 if( pOld->nOverflow>0 ){ 008355 if( NEVER(limit<pOld->aiOvfl[0]) ){ 008356 rc = SQLITE_CORRUPT_PAGE(pOld); 008357 goto balance_cleanup; 008358 } 008359 limit = pOld->aiOvfl[0]; 008360 for(j=0; j<limit; j++){ 008361 b.apCell[b.nCell] = aData + (maskPage & get2byteAligned(piCell)); 008362 piCell += 2; 008363 b.nCell++; 008364 } 008365 for(k=0; k<pOld->nOverflow; k++){ 008366 assert( k==0 || pOld->aiOvfl[k-1]+1==pOld->aiOvfl[k] );/* NOTE 1 */ 008367 b.apCell[b.nCell] = pOld->apOvfl[k]; 008368 b.nCell++; 008369 } 008370 } 008371 piEnd = aData + pOld->cellOffset + 2*pOld->nCell; 008372 while( piCell<piEnd ){ 008373 assert( b.nCell<nMaxCells ); 008374 b.apCell[b.nCell] = aData + (maskPage & get2byteAligned(piCell)); 008375 piCell += 2; 008376 b.nCell++; 008377 } 008378 assert( (b.nCell-nCellAtStart)==(pOld->nCell+pOld->nOverflow) ); 008379 008380 cntOld[i] = b.nCell; 008381 if( i<nOld-1 && !leafData){ 008382 u16 sz = (u16)szNew[i]; 008383 u8 *pTemp; 008384 assert( b.nCell<nMaxCells ); 008385 b.szCell[b.nCell] = sz; 008386 pTemp = &aSpace1[iSpace1]; 008387 iSpace1 += sz; 008388 assert( sz<=pBt->maxLocal+23 ); 008389 assert( iSpace1 <= (int)pBt->pageSize ); 008390 memcpy(pTemp, apDiv[i], sz); 008391 b.apCell[b.nCell] = pTemp+leafCorrection; 008392 assert( leafCorrection==0 || leafCorrection==4 ); 008393 b.szCell[b.nCell] = b.szCell[b.nCell] - leafCorrection; 008394 if( !pOld->leaf ){ 008395 assert( leafCorrection==0 ); 008396 assert( pOld->hdrOffset==0 || CORRUPT_DB ); 008397 /* The right pointer of the child page pOld becomes the left 008398 ** pointer of the divider cell */ 008399 memcpy(b.apCell[b.nCell], &pOld->aData[8], 4); 008400 }else{ 008401 assert( 
leafCorrection==4 ); 008402 while( b.szCell[b.nCell]<4 ){ 008403 /* Do not allow any cells smaller than 4 bytes. If a smaller cell 008404 ** does exist, pad it with 0x00 bytes. */ 008405 assert( b.szCell[b.nCell]==3 || CORRUPT_DB ); 008406 assert( b.apCell[b.nCell]==&aSpace1[iSpace1-3] || CORRUPT_DB ); 008407 aSpace1[iSpace1++] = 0x00; 008408 b.szCell[b.nCell]++; 008409 } 008410 } 008411 b.nCell++; 008412 } 008413 } 008414 008415 /* 008416 ** Figure out the number of pages needed to hold all b.nCell cells. 008417 ** Store this number in "k". Also compute szNew[] which is the total 008418 ** size of all cells on the i-th page and cntNew[] which is the index 008419 ** in b.apCell[] of the cell that divides page i from page i+1. 008420 ** cntNew[k] should equal b.nCell. 008421 ** 008422 ** Values computed by this block: 008423 ** 008424 ** k: The total number of sibling pages 008425 ** szNew[i]: Spaced used on the i-th sibling page. 008426 ** cntNew[i]: Index in b.apCell[] and b.szCell[] for the first cell to 008427 ** the right of the i-th sibling page. 008428 ** usableSpace: Number of bytes of space available on each sibling. 
008429 ** 008430 */ 008431 usableSpace = pBt->usableSize - 12 + leafCorrection; 008432 for(i=k=0; i<nOld; i++, k++){ 008433 MemPage *p = apOld[i]; 008434 b.apEnd[k] = p->aDataEnd; 008435 b.ixNx[k] = cntOld[i]; 008436 if( k && b.ixNx[k]==b.ixNx[k-1] ){ 008437 k--; /* Omit b.ixNx[] entry for child pages with no cells */ 008438 } 008439 if( !leafData ){ 008440 k++; 008441 b.apEnd[k] = pParent->aDataEnd; 008442 b.ixNx[k] = cntOld[i]+1; 008443 } 008444 assert( p->nFree>=0 ); 008445 szNew[i] = usableSpace - p->nFree; 008446 for(j=0; j<p->nOverflow; j++){ 008447 szNew[i] += 2 + p->xCellSize(p, p->apOvfl[j]); 008448 } 008449 cntNew[i] = cntOld[i]; 008450 } 008451 k = nOld; 008452 for(i=0; i<k; i++){ 008453 int sz; 008454 while( szNew[i]>usableSpace ){ 008455 if( i+1>=k ){ 008456 k = i+2; 008457 if( k>NB+2 ){ rc = SQLITE_CORRUPT_BKPT; goto balance_cleanup; } 008458 szNew[k-1] = 0; 008459 cntNew[k-1] = b.nCell; 008460 } 008461 sz = 2 + cachedCellSize(&b, cntNew[i]-1); 008462 szNew[i] -= sz; 008463 if( !leafData ){ 008464 if( cntNew[i]<b.nCell ){ 008465 sz = 2 + cachedCellSize(&b, cntNew[i]); 008466 }else{ 008467 sz = 0; 008468 } 008469 } 008470 szNew[i+1] += sz; 008471 cntNew[i]--; 008472 } 008473 while( cntNew[i]<b.nCell ){ 008474 sz = 2 + cachedCellSize(&b, cntNew[i]); 008475 if( szNew[i]+sz>usableSpace ) break; 008476 szNew[i] += sz; 008477 cntNew[i]++; 008478 if( !leafData ){ 008479 if( cntNew[i]<b.nCell ){ 008480 sz = 2 + cachedCellSize(&b, cntNew[i]); 008481 }else{ 008482 sz = 0; 008483 } 008484 } 008485 szNew[i+1] -= sz; 008486 } 008487 if( cntNew[i]>=b.nCell ){ 008488 k = i+1; 008489 }else if( cntNew[i] <= (i>0 ? cntNew[i-1] : 0) ){ 008490 rc = SQLITE_CORRUPT_BKPT; 008491 goto balance_cleanup; 008492 } 008493 } 008494 008495 /* 008496 ** The packing computed by the previous block is biased toward the siblings 008497 ** on the left side (siblings with smaller keys). The left siblings are 008498 ** always nearly full, while the right-most sibling might be nearly empty. 
008499 ** The next block of code attempts to adjust the packing of siblings to 008500 ** get a better balance. 008501 ** 008502 ** This adjustment is more than an optimization. The packing above might 008503 ** be so out of balance as to be illegal. For example, the right-most 008504 ** sibling might be completely empty. This adjustment is not optional. 008505 */ 008506 for(i=k-1; i>0; i--){ 008507 int szRight = szNew[i]; /* Size of sibling on the right */ 008508 int szLeft = szNew[i-1]; /* Size of sibling on the left */ 008509 int r; /* Index of right-most cell in left sibling */ 008510 int d; /* Index of first cell to the left of right sibling */ 008511 008512 r = cntNew[i-1] - 1; 008513 d = r + 1 - leafData; 008514 (void)cachedCellSize(&b, d); 008515 do{ 008516 int szR, szD; 008517 assert( d<nMaxCells ); 008518 assert( r<nMaxCells ); 008519 szR = cachedCellSize(&b, r); 008520 szD = b.szCell[d]; 008521 if( szRight!=0 008522 && (bBulk || szRight+szD+2 > szLeft-(szR+(i==k-1?0:2)))){ 008523 break; 008524 } 008525 szRight += szD + 2; 008526 szLeft -= szR + 2; 008527 cntNew[i-1] = r; 008528 r--; 008529 d--; 008530 }while( r>=0 ); 008531 szNew[i] = szRight; 008532 szNew[i-1] = szLeft; 008533 if( cntNew[i-1] <= (i>1 ? cntNew[i-2] : 0) ){ 008534 rc = SQLITE_CORRUPT_BKPT; 008535 goto balance_cleanup; 008536 } 008537 } 008538 008539 /* Sanity check: For a non-corrupt database file one of the following 008540 ** must be true: 008541 ** (1) We found one or more cells (cntNew[0])>0), or 008542 ** (2) pPage is a virtual root page. A virtual root page is when 008543 ** the real root page is page 1 and we are the only child of 008544 ** that page. 008545 */ 008546 assert( cntNew[0]>0 || (pParent->pgno==1 && pParent->nCell==0) || CORRUPT_DB); 008547 TRACE(("BALANCE: old: %u(nc=%u) %u(nc=%u) %u(nc=%u)\n", 008548 apOld[0]->pgno, apOld[0]->nCell, 008549 nOld>=2 ? apOld[1]->pgno : 0, nOld>=2 ? apOld[1]->nCell : 0, 008550 nOld>=3 ? apOld[2]->pgno : 0, nOld>=3 ? 
apOld[2]->nCell : 0 008551 )); 008552 008553 /* 008554 ** Allocate k new pages. Reuse old pages where possible. 008555 */ 008556 pageFlags = apOld[0]->aData[0]; 008557 for(i=0; i<k; i++){ 008558 MemPage *pNew; 008559 if( i<nOld ){ 008560 pNew = apNew[i] = apOld[i]; 008561 apOld[i] = 0; 008562 rc = sqlite3PagerWrite(pNew->pDbPage); 008563 nNew++; 008564 if( sqlite3PagerPageRefcount(pNew->pDbPage)!=1+(i==(iParentIdx-nxDiv)) 008565 && rc==SQLITE_OK 008566 ){ 008567 rc = SQLITE_CORRUPT_BKPT; 008568 } 008569 if( rc ) goto balance_cleanup; 008570 }else{ 008571 assert( i>0 ); 008572 rc = allocateBtreePage(pBt, &pNew, &pgno, (bBulk ? 1 : pgno), 0); 008573 if( rc ) goto balance_cleanup; 008574 zeroPage(pNew, pageFlags); 008575 apNew[i] = pNew; 008576 nNew++; 008577 cntOld[i] = b.nCell; 008578 008579 /* Set the pointer-map entry for the new sibling page. */ 008580 if( ISAUTOVACUUM(pBt) ){ 008581 ptrmapPut(pBt, pNew->pgno, PTRMAP_BTREE, pParent->pgno, &rc); 008582 if( rc!=SQLITE_OK ){ 008583 goto balance_cleanup; 008584 } 008585 } 008586 } 008587 } 008588 008589 /* 008590 ** Reassign page numbers so that the new pages are in ascending order. 008591 ** This helps to keep entries in the disk file in order so that a scan 008592 ** of the table is closer to a linear scan through the file. That in turn 008593 ** helps the operating system to deliver pages from the disk more rapidly. 008594 ** 008595 ** An O(N*N) sort algorithm is used, but since N is never more than NB+2 008596 ** (5), that is not a performance concern. 008597 ** 008598 ** When NB==3, this one optimization makes the database about 25% faster 008599 ** for large insertions and deletions. 
008600 */ 008601 for(i=0; i<nNew; i++){ 008602 aPgno[i] = apNew[i]->pgno; 008603 assert( apNew[i]->pDbPage->flags & PGHDR_WRITEABLE ); 008604 assert( apNew[i]->pDbPage->flags & PGHDR_DIRTY ); 008605 } 008606 for(i=0; i<nNew-1; i++){ 008607 int iB = i; 008608 for(j=i+1; j<nNew; j++){ 008609 if( apNew[j]->pgno < apNew[iB]->pgno ) iB = j; 008610 } 008611 008612 /* If apNew[i] has a page number that is bigger than any of the 008613 ** subsequence apNew[i] entries, then swap apNew[i] with the subsequent 008614 ** entry that has the smallest page number (which we know to be 008615 ** entry apNew[iB]). 008616 */ 008617 if( iB!=i ){ 008618 Pgno pgnoA = apNew[i]->pgno; 008619 Pgno pgnoB = apNew[iB]->pgno; 008620 Pgno pgnoTemp = (PENDING_BYTE/pBt->pageSize)+1; 008621 u16 fgA = apNew[i]->pDbPage->flags; 008622 u16 fgB = apNew[iB]->pDbPage->flags; 008623 sqlite3PagerRekey(apNew[i]->pDbPage, pgnoTemp, fgB); 008624 sqlite3PagerRekey(apNew[iB]->pDbPage, pgnoA, fgA); 008625 sqlite3PagerRekey(apNew[i]->pDbPage, pgnoB, fgB); 008626 apNew[i]->pgno = pgnoB; 008627 apNew[iB]->pgno = pgnoA; 008628 } 008629 } 008630 008631 TRACE(("BALANCE: new: %u(%u nc=%u) %u(%u nc=%u) %u(%u nc=%u) " 008632 "%u(%u nc=%u) %u(%u nc=%u)\n", 008633 apNew[0]->pgno, szNew[0], cntNew[0], 008634 nNew>=2 ? apNew[1]->pgno : 0, nNew>=2 ? szNew[1] : 0, 008635 nNew>=2 ? cntNew[1] - cntNew[0] - !leafData : 0, 008636 nNew>=3 ? apNew[2]->pgno : 0, nNew>=3 ? szNew[2] : 0, 008637 nNew>=3 ? cntNew[2] - cntNew[1] - !leafData : 0, 008638 nNew>=4 ? apNew[3]->pgno : 0, nNew>=4 ? szNew[3] : 0, 008639 nNew>=4 ? cntNew[3] - cntNew[2] - !leafData : 0, 008640 nNew>=5 ? apNew[4]->pgno : 0, nNew>=5 ? szNew[4] : 0, 008641 nNew>=5 ? 
cntNew[4] - cntNew[3] - !leafData : 0 008642 )); 008643 008644 assert( sqlite3PagerIswriteable(pParent->pDbPage) ); 008645 assert( nNew>=1 && nNew<=ArraySize(apNew) ); 008646 assert( apNew[nNew-1]!=0 ); 008647 put4byte(pRight, apNew[nNew-1]->pgno); 008648 008649 /* If the sibling pages are not leaves, ensure that the right-child pointer 008650 ** of the right-most new sibling page is set to the value that was 008651 ** originally in the same field of the right-most old sibling page. */ 008652 if( (pageFlags & PTF_LEAF)==0 && nOld!=nNew ){ 008653 MemPage *pOld = (nNew>nOld ? apNew : apOld)[nOld-1]; 008654 memcpy(&apNew[nNew-1]->aData[8], &pOld->aData[8], 4); 008655 } 008656 008657 /* Make any required updates to pointer map entries associated with 008658 ** cells stored on sibling pages following the balance operation. Pointer 008659 ** map entries associated with divider cells are set by the insertCell() 008660 ** routine. The associated pointer map entries are: 008661 ** 008662 ** a) if the cell contains a reference to an overflow chain, the 008663 ** entry associated with the first page in the overflow chain, and 008664 ** 008665 ** b) if the sibling pages are not leaves, the child page associated 008666 ** with the cell. 008667 ** 008668 ** If the sibling pages are not leaves, then the pointer map entry 008669 ** associated with the right-child of each sibling may also need to be 008670 ** updated. This happens below, after the sibling pages have been 008671 ** populated, not here. 008672 */ 008673 if( ISAUTOVACUUM(pBt) ){ 008674 MemPage *pOld; 008675 MemPage *pNew = pOld = apNew[0]; 008676 int cntOldNext = pNew->nCell + pNew->nOverflow; 008677 int iNew = 0; 008678 int iOld = 0; 008679 008680 for(i=0; i<b.nCell; i++){ 008681 u8 *pCell = b.apCell[i]; 008682 while( i==cntOldNext ){ 008683 iOld++; 008684 assert( iOld<nNew || iOld<nOld ); 008685 assert( iOld>=0 && iOld<NB ); 008686 pOld = iOld<nNew ? 
apNew[iOld] : apOld[iOld]; 008687 cntOldNext += pOld->nCell + pOld->nOverflow + !leafData; 008688 } 008689 if( i==cntNew[iNew] ){ 008690 pNew = apNew[++iNew]; 008691 if( !leafData ) continue; 008692 } 008693 008694 /* Cell pCell is destined for new sibling page pNew. Originally, it 008695 ** was either part of sibling page iOld (possibly an overflow cell), 008696 ** or else the divider cell to the left of sibling page iOld. So, 008697 ** if sibling page iOld had the same page number as pNew, and if 008698 ** pCell really was a part of sibling page iOld (not a divider or 008699 ** overflow cell), we can skip updating the pointer map entries. */ 008700 if( iOld>=nNew 008701 || pNew->pgno!=aPgno[iOld] 008702 || !SQLITE_WITHIN(pCell,pOld->aData,pOld->aDataEnd) 008703 ){ 008704 if( !leafCorrection ){ 008705 ptrmapPut(pBt, get4byte(pCell), PTRMAP_BTREE, pNew->pgno, &rc); 008706 } 008707 if( cachedCellSize(&b,i)>pNew->minLocal ){ 008708 ptrmapPutOvflPtr(pNew, pOld, pCell, &rc); 008709 } 008710 if( rc ) goto balance_cleanup; 008711 } 008712 } 008713 } 008714 008715 /* Insert new divider cells into pParent. */ 008716 for(i=0; i<nNew-1; i++){ 008717 u8 *pCell; 008718 u8 *pTemp; 008719 int sz; 008720 u8 *pSrcEnd; 008721 MemPage *pNew = apNew[i]; 008722 j = cntNew[i]; 008723 008724 assert( j<nMaxCells ); 008725 assert( b.apCell[j]!=0 ); 008726 pCell = b.apCell[j]; 008727 sz = b.szCell[j] + leafCorrection; 008728 pTemp = &aOvflSpace[iOvflSpace]; 008729 if( !pNew->leaf ){ 008730 memcpy(&pNew->aData[8], pCell, 4); 008731 }else if( leafData ){ 008732 /* If the tree is a leaf-data tree, and the siblings are leaves, 008733 ** then there is no divider cell in b.apCell[]. Instead, the divider 008734 ** cell consists of the integer key for the right-most cell of 008735 ** the sibling-page assembled above only. 
008736 */ 008737 CellInfo info; 008738 j--; 008739 pNew->xParseCell(pNew, b.apCell[j], &info); 008740 pCell = pTemp; 008741 sz = 4 + putVarint(&pCell[4], info.nKey); 008742 pTemp = 0; 008743 }else{ 008744 pCell -= 4; 008745 /* Obscure case for non-leaf-data trees: If the cell at pCell was 008746 ** previously stored on a leaf node, and its reported size was 4 008747 ** bytes, then it may actually be smaller than this 008748 ** (see btreeParseCellPtr(), 4 bytes is the minimum size of 008749 ** any cell). But it is important to pass the correct size to 008750 ** insertCell(), so reparse the cell now. 008751 ** 008752 ** This can only happen for b-trees used to evaluate "IN (SELECT ...)" 008753 ** and WITHOUT ROWID tables with exactly one column which is the 008754 ** primary key. 008755 */ 008756 if( b.szCell[j]==4 ){ 008757 assert(leafCorrection==4); 008758 sz = pParent->xCellSize(pParent, pCell); 008759 } 008760 } 008761 iOvflSpace += sz; 008762 assert( sz<=pBt->maxLocal+23 ); 008763 assert( iOvflSpace <= (int)pBt->pageSize ); 008764 for(k=0; ALWAYS(k<NB*2) && b.ixNx[k]<=j; k++){} 008765 pSrcEnd = b.apEnd[k]; 008766 if( SQLITE_OVERFLOW(pSrcEnd, pCell, pCell+sz) ){ 008767 rc = SQLITE_CORRUPT_BKPT; 008768 goto balance_cleanup; 008769 } 008770 rc = insertCell(pParent, nxDiv+i, pCell, sz, pTemp, pNew->pgno); 008771 if( rc!=SQLITE_OK ) goto balance_cleanup; 008772 assert( sqlite3PagerIswriteable(pParent->pDbPage) ); 008773 } 008774 008775 /* Now update the actual sibling pages. The order in which they are updated 008776 ** is important, as this code needs to avoid disrupting any page from which 008777 ** cells may still to be read. In practice, this means: 008778 ** 008779 ** (1) If cells are moving left (from apNew[iPg] to apNew[iPg-1]) 008780 ** then it is not safe to update page apNew[iPg] until after 008781 ** the left-hand sibling apNew[iPg-1] has been updated. 
008782 ** 008783 ** (2) If cells are moving right (from apNew[iPg] to apNew[iPg+1]) 008784 ** then it is not safe to update page apNew[iPg] until after 008785 ** the right-hand sibling apNew[iPg+1] has been updated. 008786 ** 008787 ** If neither of the above apply, the page is safe to update. 008788 ** 008789 ** The iPg value in the following loop starts at nNew-1 goes down 008790 ** to 0, then back up to nNew-1 again, thus making two passes over 008791 ** the pages. On the initial downward pass, only condition (1) above 008792 ** needs to be tested because (2) will always be true from the previous 008793 ** step. On the upward pass, both conditions are always true, so the 008794 ** upwards pass simply processes pages that were missed on the downward 008795 ** pass. 008796 */ 008797 for(i=1-nNew; i<nNew; i++){ 008798 int iPg = i<0 ? -i : i; 008799 assert( iPg>=0 && iPg<nNew ); 008800 assert( iPg>=1 || i>=0 ); 008801 assert( iPg<ArraySize(cntOld) ); 008802 if( abDone[iPg] ) continue; /* Skip pages already processed */ 008803 if( i>=0 /* On the upwards pass, or... */ 008804 || cntOld[iPg-1]>=cntNew[iPg-1] /* Condition (1) is true */ 008805 ){ 008806 int iNew; 008807 int iOld; 008808 int nNewCell; 008809 008810 /* Verify condition (1): If cells are moving left, update iPg 008811 ** only after iPg-1 has already been updated. */ 008812 assert( iPg==0 || cntOld[iPg-1]>=cntNew[iPg-1] || abDone[iPg-1] ); 008813 008814 /* Verify condition (2): If cells are moving right, update iPg 008815 ** only after iPg+1 has already been updated. */ 008816 assert( cntNew[iPg]>=cntOld[iPg] || abDone[iPg+1] ); 008817 008818 if( iPg==0 ){ 008819 iNew = iOld = 0; 008820 nNewCell = cntNew[0]; 008821 }else{ 008822 iOld = iPg<nOld ? 
(cntOld[iPg-1] + !leafData) : b.nCell; 008823 iNew = cntNew[iPg-1] + !leafData; 008824 nNewCell = cntNew[iPg] - iNew; 008825 } 008826 008827 rc = editPage(apNew[iPg], iOld, iNew, nNewCell, &b); 008828 if( rc ) goto balance_cleanup; 008829 abDone[iPg]++; 008830 apNew[iPg]->nFree = usableSpace-szNew[iPg]; 008831 assert( apNew[iPg]->nOverflow==0 ); 008832 assert( apNew[iPg]->nCell==nNewCell ); 008833 } 008834 } 008835 008836 /* All pages have been processed exactly once */ 008837 assert( memcmp(abDone, "\01\01\01\01\01", nNew)==0 ); 008838 008839 assert( nOld>0 ); 008840 assert( nNew>0 ); 008841 008842 if( isRoot && pParent->nCell==0 && pParent->hdrOffset<=apNew[0]->nFree ){ 008843 /* The root page of the b-tree now contains no cells. The only sibling 008844 ** page is the right-child of the parent. Copy the contents of the 008845 ** child page into the parent, decreasing the overall height of the 008846 ** b-tree structure by one. This is described as the "balance-shallower" 008847 ** sub-algorithm in some documentation. 008848 ** 008849 ** If this is an auto-vacuum database, the call to copyNodeContent() 008850 ** sets all pointer-map entries corresponding to database image pages 008851 ** for which the pointer is stored within the content being copied. 008852 ** 008853 ** It is critical that the child page be defragmented before being 008854 ** copied into the parent, because if the parent is page 1 then it will 008855 ** by smaller than the child due to the database header, and so all the 008856 ** free space needs to be up front. 
008857 */ 008858 assert( nNew==1 || CORRUPT_DB ); 008859 rc = defragmentPage(apNew[0], -1); 008860 testcase( rc!=SQLITE_OK ); 008861 assert( apNew[0]->nFree == 008862 (get2byteNotZero(&apNew[0]->aData[5]) - apNew[0]->cellOffset 008863 - apNew[0]->nCell*2) 008864 || rc!=SQLITE_OK 008865 ); 008866 copyNodeContent(apNew[0], pParent, &rc); 008867 freePage(apNew[0], &rc); 008868 }else if( ISAUTOVACUUM(pBt) && !leafCorrection ){ 008869 /* Fix the pointer map entries associated with the right-child of each 008870 ** sibling page. All other pointer map entries have already been taken 008871 ** care of. */ 008872 for(i=0; i<nNew; i++){ 008873 u32 key = get4byte(&apNew[i]->aData[8]); 008874 ptrmapPut(pBt, key, PTRMAP_BTREE, apNew[i]->pgno, &rc); 008875 } 008876 } 008877 008878 assert( pParent->isInit ); 008879 TRACE(("BALANCE: finished: old=%u new=%u cells=%u\n", 008880 nOld, nNew, b.nCell)); 008881 008882 /* Free any old pages that were not reused as new pages. 008883 */ 008884 for(i=nNew; i<nOld; i++){ 008885 freePage(apOld[i], &rc); 008886 } 008887 008888 #if 0 008889 if( ISAUTOVACUUM(pBt) && rc==SQLITE_OK && apNew[0]->isInit ){ 008890 /* The ptrmapCheckPages() contains assert() statements that verify that 008891 ** all pointer map pages are set correctly. This is helpful while 008892 ** debugging. This is usually disabled because a corrupt database may 008893 ** cause an assert() statement to fail. */ 008894 ptrmapCheckPages(apNew, nNew); 008895 ptrmapCheckPages(&pParent, 1); 008896 } 008897 #endif 008898 008899 /* 008900 ** Cleanup before returning. 008901 */ 008902 balance_cleanup: 008903 sqlite3StackFree(0, b.apCell); 008904 for(i=0; i<nOld; i++){ 008905 releasePage(apOld[i]); 008906 } 008907 for(i=0; i<nNew; i++){ 008908 releasePage(apNew[i]); 008909 } 008910 008911 return rc; 008912 } 008913 008914 008915 /* 008916 ** This function is called when the root page of a b-tree structure is 008917 ** overfull (has one or more overflow pages). 
008918 ** 008919 ** A new child page is allocated and the contents of the current root 008920 ** page, including overflow cells, are copied into the child. The root 008921 ** page is then overwritten to make it an empty page with the right-child 008922 ** pointer pointing to the new page. 008923 ** 008924 ** Before returning, all pointer-map entries corresponding to pages 008925 ** that the new child-page now contains pointers to are updated. The 008926 ** entry corresponding to the new right-child pointer of the root 008927 ** page is also updated. 008928 ** 008929 ** If successful, *ppChild is set to contain a reference to the child 008930 ** page and SQLITE_OK is returned. In this case the caller is required 008931 ** to call releasePage() on *ppChild exactly once. If an error occurs, 008932 ** an error code is returned and *ppChild is set to 0. 008933 */ 008934 static int balance_deeper(MemPage *pRoot, MemPage **ppChild){ 008935 int rc; /* Return value from subprocedures */ 008936 MemPage *pChild = 0; /* Pointer to a new child page */ 008937 Pgno pgnoChild = 0; /* Page number of the new child page */ 008938 BtShared *pBt = pRoot->pBt; /* The BTree */ 008939 008940 assert( pRoot->nOverflow>0 ); 008941 assert( sqlite3_mutex_held(pBt->mutex) ); 008942 008943 /* Make pRoot, the root page of the b-tree, writable. Allocate a new 008944 ** page that will become the new right-child of pPage. Copy the contents 008945 ** of the node stored on pRoot into the new child page. 
008946 */ 008947 rc = sqlite3PagerWrite(pRoot->pDbPage); 008948 if( rc==SQLITE_OK ){ 008949 rc = allocateBtreePage(pBt,&pChild,&pgnoChild,pRoot->pgno,0); 008950 copyNodeContent(pRoot, pChild, &rc); 008951 if( ISAUTOVACUUM(pBt) ){ 008952 ptrmapPut(pBt, pgnoChild, PTRMAP_BTREE, pRoot->pgno, &rc); 008953 } 008954 } 008955 if( rc ){ 008956 *ppChild = 0; 008957 releasePage(pChild); 008958 return rc; 008959 } 008960 assert( sqlite3PagerIswriteable(pChild->pDbPage) ); 008961 assert( sqlite3PagerIswriteable(pRoot->pDbPage) ); 008962 assert( pChild->nCell==pRoot->nCell || CORRUPT_DB ); 008963 008964 TRACE(("BALANCE: copy root %u into %u\n", pRoot->pgno, pChild->pgno)); 008965 008966 /* Copy the overflow cells from pRoot to pChild */ 008967 memcpy(pChild->aiOvfl, pRoot->aiOvfl, 008968 pRoot->nOverflow*sizeof(pRoot->aiOvfl[0])); 008969 memcpy(pChild->apOvfl, pRoot->apOvfl, 008970 pRoot->nOverflow*sizeof(pRoot->apOvfl[0])); 008971 pChild->nOverflow = pRoot->nOverflow; 008972 008973 /* Zero the contents of pRoot. Then install pChild as the right-child. */ 008974 zeroPage(pRoot, pChild->aData[0] & ~PTF_LEAF); 008975 put4byte(&pRoot->aData[pRoot->hdrOffset+8], pgnoChild); 008976 008977 *ppChild = pChild; 008978 return SQLITE_OK; 008979 } 008980 008981 /* 008982 ** Return SQLITE_CORRUPT if any cursor other than pCur is currently valid 008983 ** on the same B-tree as pCur. 008984 ** 008985 ** This can occur if a database is corrupt with two or more SQL tables 008986 ** pointing to the same b-tree. If an insert occurs on one SQL table 008987 ** and causes a BEFORE TRIGGER to do a secondary insert on the other SQL 008988 ** table linked to the same b-tree. If the secondary insert causes a 008989 ** rebalance, that can change content out from under the cursor on the 008990 ** first SQL table, violating invariants on the first insert. 
008991 */ 008992 static int anotherValidCursor(BtCursor *pCur){ 008993 BtCursor *pOther; 008994 for(pOther=pCur->pBt->pCursor; pOther; pOther=pOther->pNext){ 008995 if( pOther!=pCur 008996 && pOther->eState==CURSOR_VALID 008997 && pOther->pPage==pCur->pPage 008998 ){ 008999 return SQLITE_CORRUPT_PAGE(pCur->pPage); 009000 } 009001 } 009002 return SQLITE_OK; 009003 } 009004 009005 /* 009006 ** The page that pCur currently points to has just been modified in 009007 ** some way. This function figures out if this modification means the 009008 ** tree needs to be balanced, and if so calls the appropriate balancing 009009 ** routine. Balancing routines are: 009010 ** 009011 ** balance_quick() 009012 ** balance_deeper() 009013 ** balance_nonroot() 009014 */ 009015 static int balance(BtCursor *pCur){ 009016 int rc = SQLITE_OK; 009017 u8 aBalanceQuickSpace[13]; 009018 u8 *pFree = 0; 009019 009020 VVA_ONLY( int balance_quick_called = 0 ); 009021 VVA_ONLY( int balance_deeper_called = 0 ); 009022 009023 do { 009024 int iPage; 009025 MemPage *pPage = pCur->pPage; 009026 009027 if( NEVER(pPage->nFree<0) && btreeComputeFreeSpace(pPage) ) break; 009028 if( pPage->nOverflow==0 && pPage->nFree*3<=(int)pCur->pBt->usableSize*2 ){ 009029 /* No rebalance required as long as: 009030 ** (1) There are no overflow cells 009031 ** (2) The amount of free space on the page is less than 2/3rds of 009032 ** the total usable space on the page. */ 009033 break; 009034 }else if( (iPage = pCur->iPage)==0 ){ 009035 if( pPage->nOverflow && (rc = anotherValidCursor(pCur))==SQLITE_OK ){ 009036 /* The root page of the b-tree is overfull. In this case call the 009037 ** balance_deeper() function to create a new child for the root-page 009038 ** and copy the current contents of the root-page to it. The 009039 ** next iteration of the do-loop will balance the child page. 
009040 */ 009041 assert( balance_deeper_called==0 ); 009042 VVA_ONLY( balance_deeper_called++ ); 009043 rc = balance_deeper(pPage, &pCur->apPage[1]); 009044 if( rc==SQLITE_OK ){ 009045 pCur->iPage = 1; 009046 pCur->ix = 0; 009047 pCur->aiIdx[0] = 0; 009048 pCur->apPage[0] = pPage; 009049 pCur->pPage = pCur->apPage[1]; 009050 assert( pCur->pPage->nOverflow ); 009051 } 009052 }else{ 009053 break; 009054 } 009055 }else if( sqlite3PagerPageRefcount(pPage->pDbPage)>1 ){ 009056 /* The page being written is not a root page, and there is currently 009057 ** more than one reference to it. This only happens if the page is one 009058 ** of its own ancestor pages. Corruption. */ 009059 rc = SQLITE_CORRUPT_PAGE(pPage); 009060 }else{ 009061 MemPage * const pParent = pCur->apPage[iPage-1]; 009062 int const iIdx = pCur->aiIdx[iPage-1]; 009063 009064 rc = sqlite3PagerWrite(pParent->pDbPage); 009065 if( rc==SQLITE_OK && pParent->nFree<0 ){ 009066 rc = btreeComputeFreeSpace(pParent); 009067 } 009068 if( rc==SQLITE_OK ){ 009069 #ifndef SQLITE_OMIT_QUICKBALANCE 009070 if( pPage->intKeyLeaf 009071 && pPage->nOverflow==1 009072 && pPage->aiOvfl[0]==pPage->nCell 009073 && pParent->pgno!=1 009074 && pParent->nCell==iIdx 009075 ){ 009076 /* Call balance_quick() to create a new sibling of pPage on which 009077 ** to store the overflow cell. balance_quick() inserts a new cell 009078 ** into pParent, which may cause pParent overflow. If this 009079 ** happens, the next iteration of the do-loop will balance pParent 009080 ** use either balance_nonroot() or balance_deeper(). Until this 009081 ** happens, the overflow cell is stored in the aBalanceQuickSpace[] 009082 ** buffer. 009083 ** 009084 ** The purpose of the following assert() is to check that only a 009085 ** single call to balance_quick() is made for each call to this 009086 ** function. If this were not verified, a subtle bug involving reuse 009087 ** of the aBalanceQuickSpace[] might sneak in. 
009088 */ 009089 assert( balance_quick_called==0 ); 009090 VVA_ONLY( balance_quick_called++ ); 009091 rc = balance_quick(pParent, pPage, aBalanceQuickSpace); 009092 }else 009093 #endif 009094 { 009095 /* In this case, call balance_nonroot() to redistribute cells 009096 ** between pPage and up to 2 of its sibling pages. This involves 009097 ** modifying the contents of pParent, which may cause pParent to 009098 ** become overfull or underfull. The next iteration of the do-loop 009099 ** will balance the parent page to correct this. 009100 ** 009101 ** If the parent page becomes overfull, the overflow cell or cells 009102 ** are stored in the pSpace buffer allocated immediately below. 009103 ** A subsequent iteration of the do-loop will deal with this by 009104 ** calling balance_nonroot() (balance_deeper() may be called first, 009105 ** but it doesn't deal with overflow cells - just moves them to a 009106 ** different page). Once this subsequent call to balance_nonroot() 009107 ** has completed, it is safe to release the pSpace buffer used by 009108 ** the previous call, as the overflow cell data will have been 009109 ** copied either into the body of a database page or into the new 009110 ** pSpace buffer passed to the latter call to balance_nonroot(). 009111 */ 009112 u8 *pSpace = sqlite3PageMalloc(pCur->pBt->pageSize); 009113 rc = balance_nonroot(pParent, iIdx, pSpace, iPage==1, 009114 pCur->hints&BTREE_BULKLOAD); 009115 if( pFree ){ 009116 /* If pFree is not NULL, it points to the pSpace buffer used 009117 ** by a previous call to balance_nonroot(). Its contents are 009118 ** now stored either on real database pages or within the 009119 ** new pSpace buffer, so it may be safely freed here. */ 009120 sqlite3PageFree(pFree); 009121 } 009122 009123 /* The pSpace buffer will be freed after the next call to 009124 ** balance_nonroot(), or just before this function returns, whichever 009125 ** comes first. 
*/ 009126 pFree = pSpace; 009127 } 009128 } 009129 009130 pPage->nOverflow = 0; 009131 009132 /* The next iteration of the do-loop balances the parent page. */ 009133 releasePage(pPage); 009134 pCur->iPage--; 009135 assert( pCur->iPage>=0 ); 009136 pCur->pPage = pCur->apPage[pCur->iPage]; 009137 } 009138 }while( rc==SQLITE_OK ); 009139 009140 if( pFree ){ 009141 sqlite3PageFree(pFree); 009142 } 009143 return rc; 009144 } 009145 009146 /* Overwrite content from pX into pDest. Only do the write if the 009147 ** content is different from what is already there. 009148 */ 009149 static int btreeOverwriteContent( 009150 MemPage *pPage, /* MemPage on which writing will occur */ 009151 u8 *pDest, /* Pointer to the place to start writing */ 009152 const BtreePayload *pX, /* Source of data to write */ 009153 int iOffset, /* Offset of first byte to write */ 009154 int iAmt /* Number of bytes to be written */ 009155 ){ 009156 int nData = pX->nData - iOffset; 009157 if( nData<=0 ){ 009158 /* Overwriting with zeros */ 009159 int i; 009160 for(i=0; i<iAmt && pDest[i]==0; i++){} 009161 if( i<iAmt ){ 009162 int rc = sqlite3PagerWrite(pPage->pDbPage); 009163 if( rc ) return rc; 009164 memset(pDest + i, 0, iAmt - i); 009165 } 009166 }else{ 009167 if( nData<iAmt ){ 009168 /* Mixed read data and zeros at the end. Make a recursive call 009169 ** to write the zeros then fall through to write the real data */ 009170 int rc = btreeOverwriteContent(pPage, pDest+nData, pX, iOffset+nData, 009171 iAmt-nData); 009172 if( rc ) return rc; 009173 iAmt = nData; 009174 } 009175 if( memcmp(pDest, ((u8*)pX->pData) + iOffset, iAmt)!=0 ){ 009176 int rc = sqlite3PagerWrite(pPage->pDbPage); 009177 if( rc ) return rc; 009178 /* In a corrupt database, it is possible for the source and destination 009179 ** buffers to overlap. This is harmless since the database is already 009180 ** corrupt but it does cause valgrind and ASAN warnings. So use 009181 ** memmove(). 
*/ 009182 memmove(pDest, ((u8*)pX->pData) + iOffset, iAmt); 009183 } 009184 } 009185 return SQLITE_OK; 009186 } 009187 009188 /* 009189 ** Overwrite the cell that cursor pCur is pointing to with fresh content 009190 ** contained in pX. In this variant, pCur is pointing to an overflow 009191 ** cell. 009192 */ 009193 static SQLITE_NOINLINE int btreeOverwriteOverflowCell( 009194 BtCursor *pCur, /* Cursor pointing to cell to overwrite */ 009195 const BtreePayload *pX /* Content to write into the cell */ 009196 ){ 009197 int iOffset; /* Next byte of pX->pData to write */ 009198 int nTotal = pX->nData + pX->nZero; /* Total bytes of to write */ 009199 int rc; /* Return code */ 009200 MemPage *pPage = pCur->pPage; /* Page being written */ 009201 BtShared *pBt; /* Btree */ 009202 Pgno ovflPgno; /* Next overflow page to write */ 009203 u32 ovflPageSize; /* Size to write on overflow page */ 009204 009205 assert( pCur->info.nLocal<nTotal ); /* pCur is an overflow cell */ 009206 009207 /* Overwrite the local portion first */ 009208 rc = btreeOverwriteContent(pPage, pCur->info.pPayload, pX, 009209 0, pCur->info.nLocal); 009210 if( rc ) return rc; 009211 009212 /* Now overwrite the overflow pages */ 009213 iOffset = pCur->info.nLocal; 009214 assert( nTotal>=0 ); 009215 assert( iOffset>=0 ); 009216 ovflPgno = get4byte(pCur->info.pPayload + iOffset); 009217 pBt = pPage->pBt; 009218 ovflPageSize = pBt->usableSize - 4; 009219 do{ 009220 rc = btreeGetPage(pBt, ovflPgno, &pPage, 0); 009221 if( rc ) return rc; 009222 if( sqlite3PagerPageRefcount(pPage->pDbPage)!=1 || pPage->isInit ){ 009223 rc = SQLITE_CORRUPT_PAGE(pPage); 009224 }else{ 009225 if( iOffset+ovflPageSize<(u32)nTotal ){ 009226 ovflPgno = get4byte(pPage->aData); 009227 }else{ 009228 ovflPageSize = nTotal - iOffset; 009229 } 009230 rc = btreeOverwriteContent(pPage, pPage->aData+4, pX, 009231 iOffset, ovflPageSize); 009232 } 009233 sqlite3PagerUnref(pPage->pDbPage); 009234 if( rc ) return rc; 009235 iOffset += ovflPageSize; 
009236 }while( iOffset<nTotal ); 009237 return SQLITE_OK; 009238 } 009239 009240 /* 009241 ** Overwrite the cell that cursor pCur is pointing to with fresh content 009242 ** contained in pX. 009243 */ 009244 static int btreeOverwriteCell(BtCursor *pCur, const BtreePayload *pX){ 009245 int nTotal = pX->nData + pX->nZero; /* Total bytes of to write */ 009246 MemPage *pPage = pCur->pPage; /* Page being written */ 009247 009248 if( pCur->info.pPayload + pCur->info.nLocal > pPage->aDataEnd 009249 || pCur->info.pPayload < pPage->aData + pPage->cellOffset 009250 ){ 009251 return SQLITE_CORRUPT_PAGE(pPage); 009252 } 009253 if( pCur->info.nLocal==nTotal ){ 009254 /* The entire cell is local */ 009255 return btreeOverwriteContent(pPage, pCur->info.pPayload, pX, 009256 0, pCur->info.nLocal); 009257 }else{ 009258 /* The cell contains overflow content */ 009259 return btreeOverwriteOverflowCell(pCur, pX); 009260 } 009261 } 009262 009263 009264 /* 009265 ** Insert a new record into the BTree. The content of the new record 009266 ** is described by the pX object. The pCur cursor is used only to 009267 ** define what table the record should be inserted into, and is left 009268 ** pointing at a random location. 009269 ** 009270 ** For a table btree (used for rowid tables), only the pX.nKey value of 009271 ** the key is used. The pX.pKey value must be NULL. The pX.nKey is the 009272 ** rowid or INTEGER PRIMARY KEY of the row. The pX.nData,pData,nZero fields 009273 ** hold the content of the row. 009274 ** 009275 ** For an index btree (used for indexes and WITHOUT ROWID tables), the 009276 ** key is an arbitrary byte sequence stored in pX.pKey,nKey. The 009277 ** pX.pData,nData,nZero fields must be zero. 009278 ** 009279 ** If the seekResult parameter is non-zero, then a successful call to 009280 ** sqlite3BtreeIndexMoveto() to seek cursor pCur to (pKey,nKey) has already 009281 ** been performed. 
In other words, if seekResult!=0 then the cursor 009282 ** is currently pointing to a cell that will be adjacent to the cell 009283 ** to be inserted. If seekResult<0 then pCur points to a cell that is 009284 ** smaller then (pKey,nKey). If seekResult>0 then pCur points to a cell 009285 ** that is larger than (pKey,nKey). 009286 ** 009287 ** If seekResult==0, that means pCur is pointing at some unknown location. 009288 ** In that case, this routine must seek the cursor to the correct insertion 009289 ** point for (pKey,nKey) before doing the insertion. For index btrees, 009290 ** if pX->nMem is non-zero, then pX->aMem contains pointers to the unpacked 009291 ** key values and pX->aMem can be used instead of pX->pKey to avoid having 009292 ** to decode the key. 009293 */ 009294 int sqlite3BtreeInsert( 009295 BtCursor *pCur, /* Insert data into the table of this cursor */ 009296 const BtreePayload *pX, /* Content of the row to be inserted */ 009297 int flags, /* True if this is likely an append */ 009298 int seekResult /* Result of prior IndexMoveto() call */ 009299 ){ 009300 int rc; 009301 int loc = seekResult; /* -1: before desired location +1: after */ 009302 int szNew = 0; 009303 int idx; 009304 MemPage *pPage; 009305 Btree *p = pCur->pBtree; 009306 unsigned char *oldCell; 009307 unsigned char *newCell = 0; 009308 009309 assert( (flags & (BTREE_SAVEPOSITION|BTREE_APPEND|BTREE_PREFORMAT))==flags ); 009310 assert( (flags & BTREE_PREFORMAT)==0 || seekResult || pCur->pKeyInfo==0 ); 009311 009312 /* Save the positions of any other cursors open on this table. 009313 ** 009314 ** In some cases, the call to btreeMoveto() below is a no-op. For 009315 ** example, when inserting data into a table with auto-generated integer 009316 ** keys, the VDBE layer invokes sqlite3BtreeLast() to figure out the 009317 ** integer key to use. It then calls this function to actually insert the 009318 ** data into the intkey B-Tree. 
In this case btreeMoveto() recognizes 009319 ** that the cursor is already where it needs to be and returns without 009320 ** doing any work. To avoid thwarting these optimizations, it is important 009321 ** not to clear the cursor here. 009322 */ 009323 if( pCur->curFlags & BTCF_Multiple ){ 009324 rc = saveAllCursors(p->pBt, pCur->pgnoRoot, pCur); 009325 if( rc ) return rc; 009326 if( loc && pCur->iPage<0 ){ 009327 /* This can only happen if the schema is corrupt such that there is more 009328 ** than one table or index with the same root page as used by the cursor. 009329 ** Which can only happen if the SQLITE_NoSchemaError flag was set when 009330 ** the schema was loaded. This cannot be asserted though, as a user might 009331 ** set the flag, load the schema, and then unset the flag. */ 009332 return SQLITE_CORRUPT_PGNO(pCur->pgnoRoot); 009333 } 009334 } 009335 009336 /* Ensure that the cursor is not in the CURSOR_FAULT state and that it 009337 ** points to a valid cell. 009338 */ 009339 if( pCur->eState>=CURSOR_REQUIRESEEK ){ 009340 testcase( pCur->eState==CURSOR_REQUIRESEEK ); 009341 testcase( pCur->eState==CURSOR_FAULT ); 009342 rc = moveToRoot(pCur); 009343 if( rc && rc!=SQLITE_EMPTY ) return rc; 009344 } 009345 009346 assert( cursorOwnsBtShared(pCur) ); 009347 assert( (pCur->curFlags & BTCF_WriteFlag)!=0 009348 && p->pBt->inTransaction==TRANS_WRITE 009349 && (p->pBt->btsFlags & BTS_READ_ONLY)==0 ); 009350 assert( hasSharedCacheTableLock(p, pCur->pgnoRoot, pCur->pKeyInfo!=0, 2) ); 009351 009352 /* Assert that the caller has been consistent. If this cursor was opened 009353 ** expecting an index b-tree, then the caller should be inserting blob 009354 ** keys with no associated data. If the cursor was opened expecting an 009355 ** intkey table, the caller should be inserting integer keys with a 009356 ** blob of associated data. 
*/ 009357 assert( (flags & BTREE_PREFORMAT) || (pX->pKey==0)==(pCur->pKeyInfo==0) ); 009358 009359 if( pCur->pKeyInfo==0 ){ 009360 assert( pX->pKey==0 ); 009361 /* If this is an insert into a table b-tree, invalidate any incrblob 009362 ** cursors open on the row being replaced */ 009363 if( p->hasIncrblobCur ){ 009364 invalidateIncrblobCursors(p, pCur->pgnoRoot, pX->nKey, 0); 009365 } 009366 009367 /* If BTREE_SAVEPOSITION is set, the cursor must already be pointing 009368 ** to a row with the same key as the new entry being inserted. 009369 */ 009370 #ifdef SQLITE_DEBUG 009371 if( flags & BTREE_SAVEPOSITION ){ 009372 assert( pCur->curFlags & BTCF_ValidNKey ); 009373 assert( pX->nKey==pCur->info.nKey ); 009374 assert( loc==0 ); 009375 } 009376 #endif 009377 009378 /* On the other hand, BTREE_SAVEPOSITION==0 does not imply 009379 ** that the cursor is not pointing to a row to be overwritten. 009380 ** So do a complete check. 009381 */ 009382 if( (pCur->curFlags&BTCF_ValidNKey)!=0 && pX->nKey==pCur->info.nKey ){ 009383 /* The cursor is pointing to the entry that is to be 009384 ** overwritten */ 009385 assert( pX->nData>=0 && pX->nZero>=0 ); 009386 if( pCur->info.nSize!=0 009387 && pCur->info.nPayload==(u32)pX->nData+pX->nZero 009388 ){ 009389 /* New entry is the same size as the old. Do an overwrite */ 009390 return btreeOverwriteCell(pCur, pX); 009391 } 009392 assert( loc==0 ); 009393 }else if( loc==0 ){ 009394 /* The cursor is *not* pointing to the cell to be overwritten, nor 009395 ** to an adjacent cell. Move the cursor so that it is pointing either 009396 ** to the cell to be overwritten or an adjacent cell. 
009397 */ 009398 rc = sqlite3BtreeTableMoveto(pCur, pX->nKey, 009399 (flags & BTREE_APPEND)!=0, &loc); 009400 if( rc ) return rc; 009401 } 009402 }else{ 009403 /* This is an index or a WITHOUT ROWID table */ 009404 009405 /* If BTREE_SAVEPOSITION is set, the cursor must already be pointing 009406 ** to a row with the same key as the new entry being inserted. 009407 */ 009408 assert( (flags & BTREE_SAVEPOSITION)==0 || loc==0 ); 009409 009410 /* If the cursor is not already pointing either to the cell to be 009411 ** overwritten, or if a new cell is being inserted, if the cursor is 009412 ** not pointing to an immediately adjacent cell, then move the cursor 009413 ** so that it does. 009414 */ 009415 if( loc==0 && (flags & BTREE_SAVEPOSITION)==0 ){ 009416 if( pX->nMem ){ 009417 UnpackedRecord r; 009418 r.pKeyInfo = pCur->pKeyInfo; 009419 r.aMem = pX->aMem; 009420 r.nField = pX->nMem; 009421 r.default_rc = 0; 009422 r.eqSeen = 0; 009423 rc = sqlite3BtreeIndexMoveto(pCur, &r, &loc); 009424 }else{ 009425 rc = btreeMoveto(pCur, pX->pKey, pX->nKey, 009426 (flags & BTREE_APPEND)!=0, &loc); 009427 } 009428 if( rc ) return rc; 009429 } 009430 009431 /* If the cursor is currently pointing to an entry to be overwritten 009432 ** and the new content is the same as as the old, then use the 009433 ** overwrite optimization. 
009434 */ 009435 if( loc==0 ){ 009436 getCellInfo(pCur); 009437 if( pCur->info.nKey==pX->nKey ){ 009438 BtreePayload x2; 009439 x2.pData = pX->pKey; 009440 x2.nData = pX->nKey; 009441 x2.nZero = 0; 009442 return btreeOverwriteCell(pCur, &x2); 009443 } 009444 } 009445 } 009446 assert( pCur->eState==CURSOR_VALID 009447 || (pCur->eState==CURSOR_INVALID && loc) || CORRUPT_DB ); 009448 009449 pPage = pCur->pPage; 009450 assert( pPage->intKey || pX->nKey>=0 || (flags & BTREE_PREFORMAT) ); 009451 assert( pPage->leaf || !pPage->intKey ); 009452 if( pPage->nFree<0 ){ 009453 if( NEVER(pCur->eState>CURSOR_INVALID) ){ 009454 /* ^^^^^--- due to the moveToRoot() call above */ 009455 rc = SQLITE_CORRUPT_PAGE(pPage); 009456 }else{ 009457 rc = btreeComputeFreeSpace(pPage); 009458 } 009459 if( rc ) return rc; 009460 } 009461 009462 TRACE(("INSERT: table=%u nkey=%lld ndata=%u page=%u %s\n", 009463 pCur->pgnoRoot, pX->nKey, pX->nData, pPage->pgno, 009464 loc==0 ? "overwrite" : "new entry")); 009465 assert( pPage->isInit || CORRUPT_DB ); 009466 newCell = p->pBt->pTmpSpace; 009467 assert( newCell!=0 ); 009468 assert( BTREE_PREFORMAT==OPFLAG_PREFORMAT ); 009469 if( flags & BTREE_PREFORMAT ){ 009470 rc = SQLITE_OK; 009471 szNew = p->pBt->nPreformatSize; 009472 if( szNew<4 ){ 009473 szNew = 4; 009474 newCell[3] = 0; 009475 } 009476 if( ISAUTOVACUUM(p->pBt) && szNew>pPage->maxLocal ){ 009477 CellInfo info; 009478 pPage->xParseCell(pPage, newCell, &info); 009479 if( info.nPayload!=info.nLocal ){ 009480 Pgno ovfl = get4byte(&newCell[szNew-4]); 009481 ptrmapPut(p->pBt, ovfl, PTRMAP_OVERFLOW1, pPage->pgno, &rc); 009482 if( NEVER(rc) ) goto end_insert; 009483 } 009484 } 009485 }else{ 009486 rc = fillInCell(pPage, newCell, pX, &szNew); 009487 if( rc ) goto end_insert; 009488 } 009489 assert( szNew==pPage->xCellSize(pPage, newCell) ); 009490 assert( szNew <= MX_CELL_SIZE(p->pBt) ); 009491 idx = pCur->ix; 009492 pCur->info.nSize = 0; 009493 if( loc==0 ){ 009494 CellInfo info; 009495 assert( idx>=0 
); 009496 if( idx>=pPage->nCell ){ 009497 return SQLITE_CORRUPT_PAGE(pPage); 009498 } 009499 rc = sqlite3PagerWrite(pPage->pDbPage); 009500 if( rc ){ 009501 goto end_insert; 009502 } 009503 oldCell = findCell(pPage, idx); 009504 if( !pPage->leaf ){ 009505 memcpy(newCell, oldCell, 4); 009506 } 009507 BTREE_CLEAR_CELL(rc, pPage, oldCell, info); 009508 testcase( pCur->curFlags & BTCF_ValidOvfl ); 009509 invalidateOverflowCache(pCur); 009510 if( info.nSize==szNew && info.nLocal==info.nPayload 009511 && (!ISAUTOVACUUM(p->pBt) || szNew<pPage->minLocal) 009512 ){ 009513 /* Overwrite the old cell with the new if they are the same size. 009514 ** We could also try to do this if the old cell is smaller, then add 009515 ** the leftover space to the free list. But experiments show that 009516 ** doing that is no faster then skipping this optimization and just 009517 ** calling dropCell() and insertCell(). 009518 ** 009519 ** This optimization cannot be used on an autovacuum database if the 009520 ** new entry uses overflow pages, as the insertCell() call below is 009521 ** necessary to add the PTRMAP_OVERFLOW1 pointer-map entry. 
*/ 009522 assert( rc==SQLITE_OK ); /* clearCell never fails when nLocal==nPayload */ 009523 if( oldCell < pPage->aData+pPage->hdrOffset+10 ){ 009524 return SQLITE_CORRUPT_PAGE(pPage); 009525 } 009526 if( oldCell+szNew > pPage->aDataEnd ){ 009527 return SQLITE_CORRUPT_PAGE(pPage); 009528 } 009529 memcpy(oldCell, newCell, szNew); 009530 return SQLITE_OK; 009531 } 009532 dropCell(pPage, idx, info.nSize, &rc); 009533 if( rc ) goto end_insert; 009534 }else if( loc<0 && pPage->nCell>0 ){ 009535 assert( pPage->leaf ); 009536 idx = ++pCur->ix; 009537 pCur->curFlags &= ~(BTCF_ValidNKey|BTCF_ValidOvfl); 009538 }else{ 009539 assert( pPage->leaf ); 009540 } 009541 rc = insertCellFast(pPage, idx, newCell, szNew); 009542 assert( pPage->nOverflow==0 || rc==SQLITE_OK ); 009543 assert( rc!=SQLITE_OK || pPage->nCell>0 || pPage->nOverflow>0 ); 009544 009545 /* If no error has occurred and pPage has an overflow cell, call balance() 009546 ** to redistribute the cells within the tree. Since balance() may move 009547 ** the cursor, zero the BtCursor.info.nSize and BTCF_ValidNKey 009548 ** variables. 009549 ** 009550 ** Previous versions of SQLite called moveToRoot() to move the cursor 009551 ** back to the root page as balance() used to invalidate the contents 009552 ** of BtCursor.apPage[] and BtCursor.aiIdx[]. Instead of doing that, 009553 ** set the cursor state to "invalid". This makes common insert operations 009554 ** slightly faster. 009555 ** 009556 ** There is a subtle but important optimization here too. When inserting 009557 ** multiple records into an intkey b-tree using a single cursor (as can 009558 ** happen while processing an "INSERT INTO ... SELECT" statement), it 009559 ** is advantageous to leave the cursor pointing to the last entry in 009560 ** the b-tree if possible. 
If the cursor is left pointing to the last 009561 ** entry in the table, and the next row inserted has an integer key 009562 ** larger than the largest existing key, it is possible to insert the 009563 ** row without seeking the cursor. This can be a big performance boost. 009564 */ 009565 if( pPage->nOverflow ){ 009566 assert( rc==SQLITE_OK ); 009567 pCur->curFlags &= ~(BTCF_ValidNKey|BTCF_ValidOvfl); 009568 rc = balance(pCur); 009569 009570 /* Must make sure nOverflow is reset to zero even if the balance() 009571 ** fails. Internal data structure corruption will result otherwise. 009572 ** Also, set the cursor state to invalid. This stops saveCursorPosition() 009573 ** from trying to save the current position of the cursor. */ 009574 pCur->pPage->nOverflow = 0; 009575 pCur->eState = CURSOR_INVALID; 009576 if( (flags & BTREE_SAVEPOSITION) && rc==SQLITE_OK ){ 009577 btreeReleaseAllCursorPages(pCur); 009578 if( pCur->pKeyInfo ){ 009579 assert( pCur->pKey==0 ); 009580 pCur->pKey = sqlite3Malloc( pX->nKey ); 009581 if( pCur->pKey==0 ){ 009582 rc = SQLITE_NOMEM; 009583 }else{ 009584 memcpy(pCur->pKey, pX->pKey, pX->nKey); 009585 } 009586 } 009587 pCur->eState = CURSOR_REQUIRESEEK; 009588 pCur->nKey = pX->nKey; 009589 } 009590 } 009591 assert( pCur->iPage<0 || pCur->pPage->nOverflow==0 ); 009592 009593 end_insert: 009594 return rc; 009595 } 009596 009597 /* 009598 ** This function is used as part of copying the current row from cursor 009599 ** pSrc into cursor pDest. If the cursors are open on intkey tables, then 009600 ** parameter iKey is used as the rowid value when the record is copied 009601 ** into pDest. Otherwise, the record is copied verbatim. 009602 ** 009603 ** This function does not actually write the new value to cursor pDest. 009604 ** Instead, it creates and populates any required overflow pages and 009605 ** writes the data for the new cell into the BtShared.pTmpSpace buffer 009606 ** for the destination database. 
The size of the cell, in bytes, is left 009607 ** in BtShared.nPreformatSize. The caller completes the insertion by 009608 ** calling sqlite3BtreeInsert() with the BTREE_PREFORMAT flag specified. 009609 ** 009610 ** SQLITE_OK is returned if successful, or an SQLite error code otherwise. 009611 */ 009612 int sqlite3BtreeTransferRow(BtCursor *pDest, BtCursor *pSrc, i64 iKey){ 009613 BtShared *pBt = pDest->pBt; 009614 u8 *aOut = pBt->pTmpSpace; /* Pointer to next output buffer */ 009615 const u8 *aIn; /* Pointer to next input buffer */ 009616 u32 nIn; /* Size of input buffer aIn[] */ 009617 u32 nRem; /* Bytes of data still to copy */ 009618 009619 getCellInfo(pSrc); 009620 if( pSrc->info.nPayload<0x80 ){ 009621 *(aOut++) = pSrc->info.nPayload; 009622 }else{ 009623 aOut += sqlite3PutVarint(aOut, pSrc->info.nPayload); 009624 } 009625 if( pDest->pKeyInfo==0 ) aOut += putVarint(aOut, iKey); 009626 nIn = pSrc->info.nLocal; 009627 aIn = pSrc->info.pPayload; 009628 if( aIn+nIn>pSrc->pPage->aDataEnd ){ 009629 return SQLITE_CORRUPT_PAGE(pSrc->pPage); 009630 } 009631 nRem = pSrc->info.nPayload; 009632 if( nIn==nRem && nIn<pDest->pPage->maxLocal ){ 009633 memcpy(aOut, aIn, nIn); 009634 pBt->nPreformatSize = nIn + (aOut - pBt->pTmpSpace); 009635 return SQLITE_OK; 009636 }else{ 009637 int rc = SQLITE_OK; 009638 Pager *pSrcPager = pSrc->pBt->pPager; 009639 u8 *pPgnoOut = 0; 009640 Pgno ovflIn = 0; 009641 DbPage *pPageIn = 0; 009642 MemPage *pPageOut = 0; 009643 u32 nOut; /* Size of output buffer aOut[] */ 009644 009645 nOut = btreePayloadToLocal(pDest->pPage, pSrc->info.nPayload); 009646 pBt->nPreformatSize = nOut + (aOut - pBt->pTmpSpace); 009647 if( nOut<pSrc->info.nPayload ){ 009648 pPgnoOut = &aOut[nOut]; 009649 pBt->nPreformatSize += 4; 009650 } 009651 009652 if( nRem>nIn ){ 009653 if( aIn+nIn+4>pSrc->pPage->aDataEnd ){ 009654 return SQLITE_CORRUPT_PAGE(pSrc->pPage); 009655 } 009656 ovflIn = get4byte(&pSrc->info.pPayload[nIn]); 009657 } 009658 009659 do { 009660 nRem -= nOut; 
009661 do{ 009662 assert( nOut>0 ); 009663 if( nIn>0 ){ 009664 int nCopy = MIN(nOut, nIn); 009665 memcpy(aOut, aIn, nCopy); 009666 nOut -= nCopy; 009667 nIn -= nCopy; 009668 aOut += nCopy; 009669 aIn += nCopy; 009670 } 009671 if( nOut>0 ){ 009672 sqlite3PagerUnref(pPageIn); 009673 pPageIn = 0; 009674 rc = sqlite3PagerGet(pSrcPager, ovflIn, &pPageIn, PAGER_GET_READONLY); 009675 if( rc==SQLITE_OK ){ 009676 aIn = (const u8*)sqlite3PagerGetData(pPageIn); 009677 ovflIn = get4byte(aIn); 009678 aIn += 4; 009679 nIn = pSrc->pBt->usableSize - 4; 009680 } 009681 } 009682 }while( rc==SQLITE_OK && nOut>0 ); 009683 009684 if( rc==SQLITE_OK && nRem>0 && ALWAYS(pPgnoOut) ){ 009685 Pgno pgnoNew; 009686 MemPage *pNew = 0; 009687 rc = allocateBtreePage(pBt, &pNew, &pgnoNew, 0, 0); 009688 put4byte(pPgnoOut, pgnoNew); 009689 if( ISAUTOVACUUM(pBt) && pPageOut ){ 009690 ptrmapPut(pBt, pgnoNew, PTRMAP_OVERFLOW2, pPageOut->pgno, &rc); 009691 } 009692 releasePage(pPageOut); 009693 pPageOut = pNew; 009694 if( pPageOut ){ 009695 pPgnoOut = pPageOut->aData; 009696 put4byte(pPgnoOut, 0); 009697 aOut = &pPgnoOut[4]; 009698 nOut = MIN(pBt->usableSize - 4, nRem); 009699 } 009700 } 009701 }while( nRem>0 && rc==SQLITE_OK ); 009702 009703 releasePage(pPageOut); 009704 sqlite3PagerUnref(pPageIn); 009705 return rc; 009706 } 009707 } 009708 009709 /* 009710 ** Delete the entry that the cursor is pointing to. 009711 ** 009712 ** If the BTREE_SAVEPOSITION bit of the flags parameter is zero, then 009713 ** the cursor is left pointing at an arbitrary location after the delete. 009714 ** But if that bit is set, then the cursor is left in a state such that 009715 ** the next call to BtreeNext() or BtreePrev() moves it to the same row 009716 ** as it would have been on if the call to BtreeDelete() had been omitted. 009717 ** 009718 ** The BTREE_AUXDELETE bit of flags indicates that is one of several deletes 009719 ** associated with a single table entry and its indexes. 
Only one of those 009720 ** deletes is considered the "primary" delete. The primary delete occurs 009721 ** on a cursor that is not a BTREE_FORDELETE cursor. All but one delete 009722 ** operation on non-FORDELETE cursors is tagged with the AUXDELETE flag. 009723 ** The BTREE_AUXDELETE bit is a hint that is not used by this implementation, 009724 ** but which might be used by alternative storage engines. 009725 */ 009726 int sqlite3BtreeDelete(BtCursor *pCur, u8 flags){ 009727 Btree *p = pCur->pBtree; 009728 BtShared *pBt = p->pBt; 009729 int rc; /* Return code */ 009730 MemPage *pPage; /* Page to delete cell from */ 009731 unsigned char *pCell; /* Pointer to cell to delete */ 009732 int iCellIdx; /* Index of cell to delete */ 009733 int iCellDepth; /* Depth of node containing pCell */ 009734 CellInfo info; /* Size of the cell being deleted */ 009735 u8 bPreserve; /* Keep cursor valid. 2 for CURSOR_SKIPNEXT */ 009736 009737 assert( cursorOwnsBtShared(pCur) ); 009738 assert( pBt->inTransaction==TRANS_WRITE ); 009739 assert( (pBt->btsFlags & BTS_READ_ONLY)==0 ); 009740 assert( pCur->curFlags & BTCF_WriteFlag ); 009741 assert( hasSharedCacheTableLock(p, pCur->pgnoRoot, pCur->pKeyInfo!=0, 2) ); 009742 assert( !hasReadConflicts(p, pCur->pgnoRoot) ); 009743 assert( (flags & ~(BTREE_SAVEPOSITION | BTREE_AUXDELETE))==0 ); 009744 if( pCur->eState!=CURSOR_VALID ){ 009745 if( pCur->eState>=CURSOR_REQUIRESEEK ){ 009746 rc = btreeRestoreCursorPosition(pCur); 009747 assert( rc!=SQLITE_OK || CORRUPT_DB || pCur->eState==CURSOR_VALID ); 009748 if( rc || pCur->eState!=CURSOR_VALID ) return rc; 009749 }else{ 009750 return SQLITE_CORRUPT_PGNO(pCur->pgnoRoot); 009751 } 009752 } 009753 assert( pCur->eState==CURSOR_VALID ); 009754 009755 iCellDepth = pCur->iPage; 009756 iCellIdx = pCur->ix; 009757 pPage = pCur->pPage; 009758 if( pPage->nCell<=iCellIdx ){ 009759 return SQLITE_CORRUPT_PAGE(pPage); 009760 } 009761 pCell = findCell(pPage, iCellIdx); 009762 if( pPage->nFree<0 && 
btreeComputeFreeSpace(pPage) ){ 009763 return SQLITE_CORRUPT_PAGE(pPage); 009764 } 009765 if( pCell<&pPage->aCellIdx[pPage->nCell] ){ 009766 return SQLITE_CORRUPT_PAGE(pPage); 009767 } 009768 009769 /* If the BTREE_SAVEPOSITION bit is on, then the cursor position must 009770 ** be preserved following this delete operation. If the current delete 009771 ** will cause a b-tree rebalance, then this is done by saving the cursor 009772 ** key and leaving the cursor in CURSOR_REQUIRESEEK state before 009773 ** returning. 009774 ** 009775 ** If the current delete will not cause a rebalance, then the cursor 009776 ** will be left in CURSOR_SKIPNEXT state pointing to the entry immediately 009777 ** before or after the deleted entry. 009778 ** 009779 ** The bPreserve value records which path is required: 009780 ** 009781 ** bPreserve==0 Not necessary to save the cursor position 009782 ** bPreserve==1 Use CURSOR_REQUIRESEEK to save the cursor position 009783 ** bPreserve==2 Cursor won't move. Set CURSOR_SKIPNEXT. 009784 */ 009785 bPreserve = (flags & BTREE_SAVEPOSITION)!=0; 009786 if( bPreserve ){ 009787 if( !pPage->leaf 009788 || (pPage->nFree+pPage->xCellSize(pPage,pCell)+2) > 009789 (int)(pBt->usableSize*2/3) 009790 || pPage->nCell==1 /* See dbfuzz001.test for a test case */ 009791 ){ 009792 /* A b-tree rebalance will be required after deleting this entry. 009793 ** Save the cursor key. */ 009794 rc = saveCursorKey(pCur); 009795 if( rc ) return rc; 009796 }else{ 009797 bPreserve = 2; 009798 } 009799 } 009800 009801 /* If the page containing the entry to delete is not a leaf page, move 009802 ** the cursor to the largest entry in the tree that is smaller than 009803 ** the entry being deleted. This cell will replace the cell being deleted 009804 ** from the internal node. The 'previous' entry is used for this instead 009805 ** of the 'next' entry, as the previous entry is always a part of the 009806 ** sub-tree headed by the child page of the cell being deleted. 
This makes 009807 ** balancing the tree following the delete operation easier. */ 009808 if( !pPage->leaf ){ 009809 rc = sqlite3BtreePrevious(pCur, 0); 009810 assert( rc!=SQLITE_DONE ); 009811 if( rc ) return rc; 009812 } 009813 009814 /* Save the positions of any other cursors open on this table before 009815 ** making any modifications. */ 009816 if( pCur->curFlags & BTCF_Multiple ){ 009817 rc = saveAllCursors(pBt, pCur->pgnoRoot, pCur); 009818 if( rc ) return rc; 009819 } 009820 009821 /* If this is a delete operation to remove a row from a table b-tree, 009822 ** invalidate any incrblob cursors open on the row being deleted. */ 009823 if( pCur->pKeyInfo==0 && p->hasIncrblobCur ){ 009824 invalidateIncrblobCursors(p, pCur->pgnoRoot, pCur->info.nKey, 0); 009825 } 009826 009827 /* Make the page containing the entry to be deleted writable. Then free any 009828 ** overflow pages associated with the entry and finally remove the cell 009829 ** itself from within the page. */ 009830 rc = sqlite3PagerWrite(pPage->pDbPage); 009831 if( rc ) return rc; 009832 BTREE_CLEAR_CELL(rc, pPage, pCell, info); 009833 dropCell(pPage, iCellIdx, info.nSize, &rc); 009834 if( rc ) return rc; 009835 009836 /* If the cell deleted was not located on a leaf page, then the cursor 009837 ** is currently pointing to the largest entry in the sub-tree headed 009838 ** by the child-page of the cell that was just deleted from an internal 009839 ** node. The cell from the leaf node needs to be moved to the internal 009840 ** node to replace the deleted cell. 
*/ 009841 if( !pPage->leaf ){ 009842 MemPage *pLeaf = pCur->pPage; 009843 int nCell; 009844 Pgno n; 009845 unsigned char *pTmp; 009846 009847 if( pLeaf->nFree<0 ){ 009848 rc = btreeComputeFreeSpace(pLeaf); 009849 if( rc ) return rc; 009850 } 009851 if( iCellDepth<pCur->iPage-1 ){ 009852 n = pCur->apPage[iCellDepth+1]->pgno; 009853 }else{ 009854 n = pCur->pPage->pgno; 009855 } 009856 pCell = findCell(pLeaf, pLeaf->nCell-1); 009857 if( pCell<&pLeaf->aData[4] ) return SQLITE_CORRUPT_PAGE(pLeaf); 009858 nCell = pLeaf->xCellSize(pLeaf, pCell); 009859 assert( MX_CELL_SIZE(pBt) >= nCell ); 009860 pTmp = pBt->pTmpSpace; 009861 assert( pTmp!=0 ); 009862 rc = sqlite3PagerWrite(pLeaf->pDbPage); 009863 if( rc==SQLITE_OK ){ 009864 rc = insertCell(pPage, iCellIdx, pCell-4, nCell+4, pTmp, n); 009865 } 009866 dropCell(pLeaf, pLeaf->nCell-1, nCell, &rc); 009867 if( rc ) return rc; 009868 } 009869 009870 /* Balance the tree. If the entry deleted was located on a leaf page, 009871 ** then the cursor still points to that page. In this case the first 009872 ** call to balance() repairs the tree, and the if(...) condition is 009873 ** never true. 009874 ** 009875 ** Otherwise, if the entry deleted was on an internal node page, then 009876 ** pCur is pointing to the leaf page from which a cell was removed to 009877 ** replace the cell deleted from the internal node. This is slightly 009878 ** tricky as the leaf node may be underfull, and the internal node may 009879 ** be either under or overfull. In this case run the balancing algorithm 009880 ** on the leaf node first. If the balance proceeds far enough up the 009881 ** tree that we can be sure that any problem in the internal node has 009882 ** been corrected, so be it. Otherwise, after balancing the leaf node, 009883 ** walk the cursor up the tree to the internal node and balance it as 009884 ** well. 
*/ 009885 assert( pCur->pPage->nOverflow==0 ); 009886 assert( pCur->pPage->nFree>=0 ); 009887 if( pCur->pPage->nFree*3<=(int)pCur->pBt->usableSize*2 ){ 009888 /* Optimization: If the free space is less than 2/3rds of the page, 009889 ** then balance() will always be a no-op. No need to invoke it. */ 009890 rc = SQLITE_OK; 009891 }else{ 009892 rc = balance(pCur); 009893 } 009894 if( rc==SQLITE_OK && pCur->iPage>iCellDepth ){ 009895 releasePageNotNull(pCur->pPage); 009896 pCur->iPage--; 009897 while( pCur->iPage>iCellDepth ){ 009898 releasePage(pCur->apPage[pCur->iPage--]); 009899 } 009900 pCur->pPage = pCur->apPage[pCur->iPage]; 009901 rc = balance(pCur); 009902 } 009903 009904 if( rc==SQLITE_OK ){ 009905 if( bPreserve>1 ){ 009906 assert( (pCur->iPage==iCellDepth || CORRUPT_DB) ); 009907 assert( pPage==pCur->pPage || CORRUPT_DB ); 009908 assert( (pPage->nCell>0 || CORRUPT_DB) && iCellIdx<=pPage->nCell ); 009909 pCur->eState = CURSOR_SKIPNEXT; 009910 if( iCellIdx>=pPage->nCell ){ 009911 pCur->skipNext = -1; 009912 pCur->ix = pPage->nCell-1; 009913 }else{ 009914 pCur->skipNext = 1; 009915 } 009916 }else{ 009917 rc = moveToRoot(pCur); 009918 if( bPreserve ){ 009919 btreeReleaseAllCursorPages(pCur); 009920 pCur->eState = CURSOR_REQUIRESEEK; 009921 } 009922 if( rc==SQLITE_EMPTY ) rc = SQLITE_OK; 009923 } 009924 } 009925 return rc; 009926 } 009927 009928 /* 009929 ** Create a new BTree table. Write into *piTable the page 009930 ** number for the root page of the new table. 009931 ** 009932 ** The type of type is determined by the flags parameter. Only the 009933 ** following values of flags are currently in use. 
Other values for 009934 ** flags might not work: 009935 ** 009936 ** BTREE_INTKEY|BTREE_LEAFDATA Used for SQL tables with rowid keys 009937 ** BTREE_ZERODATA Used for SQL indices 009938 */ 009939 static int btreeCreateTable(Btree *p, Pgno *piTable, int createTabFlags){ 009940 BtShared *pBt = p->pBt; 009941 MemPage *pRoot; 009942 Pgno pgnoRoot; 009943 int rc; 009944 int ptfFlags; /* Page-type flags for the root page of new table */ 009945 009946 assert( sqlite3BtreeHoldsMutex(p) ); 009947 assert( pBt->inTransaction==TRANS_WRITE ); 009948 assert( (pBt->btsFlags & BTS_READ_ONLY)==0 ); 009949 009950 #ifdef SQLITE_OMIT_AUTOVACUUM 009951 rc = allocateBtreePage(pBt, &pRoot, &pgnoRoot, 1, 0); 009952 if( rc ){ 009953 return rc; 009954 } 009955 #else 009956 if( pBt->autoVacuum ){ 009957 Pgno pgnoMove; /* Move a page here to make room for the root-page */ 009958 MemPage *pPageMove; /* The page to move to. */ 009959 009960 /* Creating a new table may probably require moving an existing database 009961 ** to make room for the new tables root page. In case this page turns 009962 ** out to be an overflow page, delete all overflow page-map caches 009963 ** held by open cursors. 009964 */ 009965 invalidateAllOverflowCache(pBt); 009966 009967 /* Read the value of meta[3] from the database to determine where the 009968 ** root page of the new table should go. meta[3] is the largest root-page 009969 ** created so far, so the new root-page is (meta[3]+1). 009970 */ 009971 sqlite3BtreeGetMeta(p, BTREE_LARGEST_ROOT_PAGE, &pgnoRoot); 009972 if( pgnoRoot>btreePagecount(pBt) ){ 009973 return SQLITE_CORRUPT_PGNO(pgnoRoot); 009974 } 009975 pgnoRoot++; 009976 009977 /* The new root-page may not be allocated on a pointer-map page, or the 009978 ** PENDING_BYTE page. 009979 */ 009980 while( pgnoRoot==PTRMAP_PAGENO(pBt, pgnoRoot) || 009981 pgnoRoot==PENDING_BYTE_PAGE(pBt) ){ 009982 pgnoRoot++; 009983 } 009984 assert( pgnoRoot>=3 ); 009985 009986 /* Allocate a page. 
The page that currently resides at pgnoRoot will 009987 ** be moved to the allocated page (unless the allocated page happens 009988 ** to reside at pgnoRoot). 009989 */ 009990 rc = allocateBtreePage(pBt, &pPageMove, &pgnoMove, pgnoRoot, BTALLOC_EXACT); 009991 if( rc!=SQLITE_OK ){ 009992 return rc; 009993 } 009994 009995 if( pgnoMove!=pgnoRoot ){ 009996 /* pgnoRoot is the page that will be used for the root-page of 009997 ** the new table (assuming an error did not occur). But we were 009998 ** allocated pgnoMove. If required (i.e. if it was not allocated 009999 ** by extending the file), the current page at position pgnoMove 010000 ** is already journaled. 010001 */ 010002 u8 eType = 0; 010003 Pgno iPtrPage = 0; 010004 010005 /* Save the positions of any open cursors. This is required in 010006 ** case they are holding a reference to an xFetch reference 010007 ** corresponding to page pgnoRoot. */ 010008 rc = saveAllCursors(pBt, 0, 0); 010009 releasePage(pPageMove); 010010 if( rc!=SQLITE_OK ){ 010011 return rc; 010012 } 010013 010014 /* Move the page currently at pgnoRoot to pgnoMove. 
*/ 010015 rc = btreeGetPage(pBt, pgnoRoot, &pRoot, 0); 010016 if( rc!=SQLITE_OK ){ 010017 return rc; 010018 } 010019 rc = ptrmapGet(pBt, pgnoRoot, &eType, &iPtrPage); 010020 if( eType==PTRMAP_ROOTPAGE || eType==PTRMAP_FREEPAGE ){ 010021 rc = SQLITE_CORRUPT_PGNO(pgnoRoot); 010022 } 010023 if( rc!=SQLITE_OK ){ 010024 releasePage(pRoot); 010025 return rc; 010026 } 010027 assert( eType!=PTRMAP_ROOTPAGE ); 010028 assert( eType!=PTRMAP_FREEPAGE ); 010029 rc = relocatePage(pBt, pRoot, eType, iPtrPage, pgnoMove, 0); 010030 releasePage(pRoot); 010031 010032 /* Obtain the page at pgnoRoot */ 010033 if( rc!=SQLITE_OK ){ 010034 return rc; 010035 } 010036 rc = btreeGetPage(pBt, pgnoRoot, &pRoot, 0); 010037 if( rc!=SQLITE_OK ){ 010038 return rc; 010039 } 010040 rc = sqlite3PagerWrite(pRoot->pDbPage); 010041 if( rc!=SQLITE_OK ){ 010042 releasePage(pRoot); 010043 return rc; 010044 } 010045 }else{ 010046 pRoot = pPageMove; 010047 } 010048 010049 /* Update the pointer-map and meta-data with the new root-page number. */ 010050 ptrmapPut(pBt, pgnoRoot, PTRMAP_ROOTPAGE, 0, &rc); 010051 if( rc ){ 010052 releasePage(pRoot); 010053 return rc; 010054 } 010055 010056 /* When the new root page was allocated, page 1 was made writable in 010057 ** order either to increase the database filesize, or to decrement the 010058 ** freelist count. Hence, the sqlite3BtreeUpdateMeta() call cannot fail. 
010059 */ 010060 assert( sqlite3PagerIswriteable(pBt->pPage1->pDbPage) ); 010061 rc = sqlite3BtreeUpdateMeta(p, 4, pgnoRoot); 010062 if( NEVER(rc) ){ 010063 releasePage(pRoot); 010064 return rc; 010065 } 010066 010067 }else{ 010068 rc = allocateBtreePage(pBt, &pRoot, &pgnoRoot, 1, 0); 010069 if( rc ) return rc; 010070 } 010071 #endif 010072 assert( sqlite3PagerIswriteable(pRoot->pDbPage) ); 010073 if( createTabFlags & BTREE_INTKEY ){ 010074 ptfFlags = PTF_INTKEY | PTF_LEAFDATA | PTF_LEAF; 010075 }else{ 010076 ptfFlags = PTF_ZERODATA | PTF_LEAF; 010077 } 010078 zeroPage(pRoot, ptfFlags); 010079 sqlite3PagerUnref(pRoot->pDbPage); 010080 assert( (pBt->openFlags & BTREE_SINGLE)==0 || pgnoRoot==2 ); 010081 *piTable = pgnoRoot; 010082 return SQLITE_OK; 010083 } 010084 int sqlite3BtreeCreateTable(Btree *p, Pgno *piTable, int flags){ 010085 int rc; 010086 sqlite3BtreeEnter(p); 010087 rc = btreeCreateTable(p, piTable, flags); 010088 sqlite3BtreeLeave(p); 010089 return rc; 010090 } 010091 010092 /* 010093 ** Erase the given database page and all its children. Return 010094 ** the page to the freelist. 
010095 */ 010096 static int clearDatabasePage( 010097 BtShared *pBt, /* The BTree that contains the table */ 010098 Pgno pgno, /* Page number to clear */ 010099 int freePageFlag, /* Deallocate page if true */ 010100 i64 *pnChange /* Add number of Cells freed to this counter */ 010101 ){ 010102 MemPage *pPage; 010103 int rc; 010104 unsigned char *pCell; 010105 int i; 010106 int hdr; 010107 CellInfo info; 010108 010109 assert( sqlite3_mutex_held(pBt->mutex) ); 010110 if( pgno>btreePagecount(pBt) ){ 010111 return SQLITE_CORRUPT_PGNO(pgno); 010112 } 010113 rc = getAndInitPage(pBt, pgno, &pPage, 0); 010114 if( rc ) return rc; 010115 if( (pBt->openFlags & BTREE_SINGLE)==0 010116 && sqlite3PagerPageRefcount(pPage->pDbPage) != (1 + (pgno==1)) 010117 ){ 010118 rc = SQLITE_CORRUPT_PAGE(pPage); 010119 goto cleardatabasepage_out; 010120 } 010121 hdr = pPage->hdrOffset; 010122 for(i=0; i<pPage->nCell; i++){ 010123 pCell = findCell(pPage, i); 010124 if( !pPage->leaf ){ 010125 rc = clearDatabasePage(pBt, get4byte(pCell), 1, pnChange); 010126 if( rc ) goto cleardatabasepage_out; 010127 } 010128 BTREE_CLEAR_CELL(rc, pPage, pCell, info); 010129 if( rc ) goto cleardatabasepage_out; 010130 } 010131 if( !pPage->leaf ){ 010132 rc = clearDatabasePage(pBt, get4byte(&pPage->aData[hdr+8]), 1, pnChange); 010133 if( rc ) goto cleardatabasepage_out; 010134 if( pPage->intKey ) pnChange = 0; 010135 } 010136 if( pnChange ){ 010137 testcase( !pPage->intKey ); 010138 *pnChange += pPage->nCell; 010139 } 010140 if( freePageFlag ){ 010141 freePage(pPage, &rc); 010142 }else if( (rc = sqlite3PagerWrite(pPage->pDbPage))==0 ){ 010143 zeroPage(pPage, pPage->aData[hdr] | PTF_LEAF); 010144 } 010145 010146 cleardatabasepage_out: 010147 releasePage(pPage); 010148 return rc; 010149 } 010150 010151 /* 010152 ** Delete all information from a single table in the database. iTable is 010153 ** the page number of the root of the table. After this routine returns, 010154 ** the root page is empty, but still exists. 
010155 ** 010156 ** This routine will fail with SQLITE_LOCKED if there are any open 010157 ** read cursors on the table. Open write cursors are moved to the 010158 ** root of the table. 010159 ** 010160 ** If pnChange is not NULL, then the integer value pointed to by pnChange 010161 ** is incremented by the number of entries in the table. 010162 */ 010163 int sqlite3BtreeClearTable(Btree *p, int iTable, i64 *pnChange){ 010164 int rc; 010165 BtShared *pBt = p->pBt; 010166 sqlite3BtreeEnter(p); 010167 assert( p->inTrans==TRANS_WRITE ); 010168 010169 rc = saveAllCursors(pBt, (Pgno)iTable, 0); 010170 010171 if( SQLITE_OK==rc ){ 010172 /* Invalidate all incrblob cursors open on table iTable (assuming iTable 010173 ** is the root of a table b-tree - if it is not, the following call is 010174 ** a no-op). */ 010175 if( p->hasIncrblobCur ){ 010176 invalidateIncrblobCursors(p, (Pgno)iTable, 0, 1); 010177 } 010178 rc = clearDatabasePage(pBt, (Pgno)iTable, 0, pnChange); 010179 } 010180 sqlite3BtreeLeave(p); 010181 return rc; 010182 } 010183 010184 /* 010185 ** Delete all information from the single table that pCur is open on. 010186 ** 010187 ** This routine only work for pCur on an ephemeral table. 010188 */ 010189 int sqlite3BtreeClearTableOfCursor(BtCursor *pCur){ 010190 return sqlite3BtreeClearTable(pCur->pBtree, pCur->pgnoRoot, 0); 010191 } 010192 010193 /* 010194 ** Erase all information in a table and add the root of the table to 010195 ** the freelist. Except, the root of the principle table (the one on 010196 ** page 1) is never added to the freelist. 010197 ** 010198 ** This routine will fail with SQLITE_LOCKED if there are any open 010199 ** cursors on the table. 
010200 ** 010201 ** If AUTOVACUUM is enabled and the page at iTable is not the last 010202 ** root page in the database file, then the last root page 010203 ** in the database file is moved into the slot formerly occupied by 010204 ** iTable and that last slot formerly occupied by the last root page 010205 ** is added to the freelist instead of iTable. In this say, all 010206 ** root pages are kept at the beginning of the database file, which 010207 ** is necessary for AUTOVACUUM to work right. *piMoved is set to the 010208 ** page number that used to be the last root page in the file before 010209 ** the move. If no page gets moved, *piMoved is set to 0. 010210 ** The last root page is recorded in meta[3] and the value of 010211 ** meta[3] is updated by this procedure. 010212 */ 010213 static int btreeDropTable(Btree *p, Pgno iTable, int *piMoved){ 010214 int rc; 010215 MemPage *pPage = 0; 010216 BtShared *pBt = p->pBt; 010217 010218 assert( sqlite3BtreeHoldsMutex(p) ); 010219 assert( p->inTrans==TRANS_WRITE ); 010220 assert( iTable>=2 ); 010221 if( iTable>btreePagecount(pBt) ){ 010222 return SQLITE_CORRUPT_PGNO(iTable); 010223 } 010224 010225 rc = sqlite3BtreeClearTable(p, iTable, 0); 010226 if( rc ) return rc; 010227 rc = btreeGetPage(pBt, (Pgno)iTable, &pPage, 0); 010228 if( NEVER(rc) ){ 010229 releasePage(pPage); 010230 return rc; 010231 } 010232 010233 *piMoved = 0; 010234 010235 #ifdef SQLITE_OMIT_AUTOVACUUM 010236 freePage(pPage, &rc); 010237 releasePage(pPage); 010238 #else 010239 if( pBt->autoVacuum ){ 010240 Pgno maxRootPgno; 010241 sqlite3BtreeGetMeta(p, BTREE_LARGEST_ROOT_PAGE, &maxRootPgno); 010242 010243 if( iTable==maxRootPgno ){ 010244 /* If the table being dropped is the table with the largest root-page 010245 ** number in the database, put the root page on the free list. 
010246 */ 010247 freePage(pPage, &rc); 010248 releasePage(pPage); 010249 if( rc!=SQLITE_OK ){ 010250 return rc; 010251 } 010252 }else{ 010253 /* The table being dropped does not have the largest root-page 010254 ** number in the database. So move the page that does into the 010255 ** gap left by the deleted root-page. 010256 */ 010257 MemPage *pMove; 010258 releasePage(pPage); 010259 rc = btreeGetPage(pBt, maxRootPgno, &pMove, 0); 010260 if( rc!=SQLITE_OK ){ 010261 return rc; 010262 } 010263 rc = relocatePage(pBt, pMove, PTRMAP_ROOTPAGE, 0, iTable, 0); 010264 releasePage(pMove); 010265 if( rc!=SQLITE_OK ){ 010266 return rc; 010267 } 010268 pMove = 0; 010269 rc = btreeGetPage(pBt, maxRootPgno, &pMove, 0); 010270 freePage(pMove, &rc); 010271 releasePage(pMove); 010272 if( rc!=SQLITE_OK ){ 010273 return rc; 010274 } 010275 *piMoved = maxRootPgno; 010276 } 010277 010278 /* Set the new 'max-root-page' value in the database header. This 010279 ** is the old value less one, less one more if that happens to 010280 ** be a root-page number, less one again if that is the 010281 ** PENDING_BYTE_PAGE. 010282 */ 010283 maxRootPgno--; 010284 while( maxRootPgno==PENDING_BYTE_PAGE(pBt) 010285 || PTRMAP_ISPAGE(pBt, maxRootPgno) ){ 010286 maxRootPgno--; 010287 } 010288 assert( maxRootPgno!=PENDING_BYTE_PAGE(pBt) ); 010289 010290 rc = sqlite3BtreeUpdateMeta(p, 4, maxRootPgno); 010291 }else{ 010292 freePage(pPage, &rc); 010293 releasePage(pPage); 010294 } 010295 #endif 010296 return rc; 010297 } 010298 int sqlite3BtreeDropTable(Btree *p, int iTable, int *piMoved){ 010299 int rc; 010300 sqlite3BtreeEnter(p); 010301 rc = btreeDropTable(p, iTable, piMoved); 010302 sqlite3BtreeLeave(p); 010303 return rc; 010304 } 010305 010306 010307 /* 010308 ** This function may only be called if the b-tree connection already 010309 ** has a read or write transaction open on the database. 010310 ** 010311 ** Read the meta-information out of a database file. 
Meta[0] 010312 ** is the number of free pages currently in the database. Meta[1] 010313 ** through meta[15] are available for use by higher layers. Meta[0] 010314 ** is read-only, the others are read/write. 010315 ** 010316 ** The schema layer numbers meta values differently. At the schema 010317 ** layer (and the SetCookie and ReadCookie opcodes) the number of 010318 ** free pages is not visible. So Cookie[0] is the same as Meta[1]. 010319 ** 010320 ** This routine treats Meta[BTREE_DATA_VERSION] as a special case. Instead 010321 ** of reading the value out of the header, it instead loads the "DataVersion" 010322 ** from the pager. The BTREE_DATA_VERSION value is not actually stored in the 010323 ** database file. It is a number computed by the pager. But its access 010324 ** pattern is the same as header meta values, and so it is convenient to 010325 ** read it from this routine. 010326 */ 010327 void sqlite3BtreeGetMeta(Btree *p, int idx, u32 *pMeta){ 010328 BtShared *pBt = p->pBt; 010329 010330 sqlite3BtreeEnter(p); 010331 assert( p->inTrans>TRANS_NONE ); 010332 assert( SQLITE_OK==querySharedCacheTableLock(p, SCHEMA_ROOT, READ_LOCK) ); 010333 assert( pBt->pPage1 ); 010334 assert( idx>=0 && idx<=15 ); 010335 010336 if( idx==BTREE_DATA_VERSION ){ 010337 *pMeta = sqlite3PagerDataVersion(pBt->pPager) + p->iBDataVersion; 010338 }else{ 010339 *pMeta = get4byte(&pBt->pPage1->aData[36 + idx*4]); 010340 } 010341 010342 /* If auto-vacuum is disabled in this build and this is an auto-vacuum 010343 ** database, mark the database as read-only. */ 010344 #ifdef SQLITE_OMIT_AUTOVACUUM 010345 if( idx==BTREE_LARGEST_ROOT_PAGE && *pMeta>0 ){ 010346 pBt->btsFlags |= BTS_READ_ONLY; 010347 } 010348 #endif 010349 010350 sqlite3BtreeLeave(p); 010351 } 010352 010353 /* 010354 ** Write meta-information back into the database. Meta[0] is 010355 ** read-only and may not be written. 
010356 */ 010357 int sqlite3BtreeUpdateMeta(Btree *p, int idx, u32 iMeta){ 010358 BtShared *pBt = p->pBt; 010359 unsigned char *pP1; 010360 int rc; 010361 assert( idx>=1 && idx<=15 ); 010362 sqlite3BtreeEnter(p); 010363 assert( p->inTrans==TRANS_WRITE ); 010364 assert( pBt->pPage1!=0 ); 010365 pP1 = pBt->pPage1->aData; 010366 rc = sqlite3PagerWrite(pBt->pPage1->pDbPage); 010367 if( rc==SQLITE_OK ){ 010368 put4byte(&pP1[36 + idx*4], iMeta); 010369 #ifndef SQLITE_OMIT_AUTOVACUUM 010370 if( idx==BTREE_INCR_VACUUM ){ 010371 assert( pBt->autoVacuum || iMeta==0 ); 010372 assert( iMeta==0 || iMeta==1 ); 010373 pBt->incrVacuum = (u8)iMeta; 010374 } 010375 #endif 010376 } 010377 sqlite3BtreeLeave(p); 010378 return rc; 010379 } 010380 010381 /* 010382 ** The first argument, pCur, is a cursor opened on some b-tree. Count the 010383 ** number of entries in the b-tree and write the result to *pnEntry. 010384 ** 010385 ** SQLITE_OK is returned if the operation is successfully executed. 010386 ** Otherwise, if an error is encountered (i.e. an IO error or database 010387 ** corruption) an SQLite error code is returned. 010388 */ 010389 int sqlite3BtreeCount(sqlite3 *db, BtCursor *pCur, i64 *pnEntry){ 010390 i64 nEntry = 0; /* Value to return in *pnEntry */ 010391 int rc; /* Return code */ 010392 010393 rc = moveToRoot(pCur); 010394 if( rc==SQLITE_EMPTY ){ 010395 *pnEntry = 0; 010396 return SQLITE_OK; 010397 } 010398 010399 /* Unless an error occurs, the following loop runs one iteration for each 010400 ** page in the B-Tree structure (not including overflow pages). 010401 */ 010402 while( rc==SQLITE_OK && !AtomicLoad(&db->u1.isInterrupted) ){ 010403 int iIdx; /* Index of child node in parent */ 010404 MemPage *pPage; /* Current page of the b-tree */ 010405 010406 /* If this is a leaf page or the tree is not an int-key tree, then 010407 ** this page contains countable entries. Increment the entry counter 010408 ** accordingly. 
010409 */ 010410 pPage = pCur->pPage; 010411 if( pPage->leaf || !pPage->intKey ){ 010412 nEntry += pPage->nCell; 010413 } 010414 010415 /* pPage is a leaf node. This loop navigates the cursor so that it 010416 ** points to the first interior cell that it points to the parent of 010417 ** the next page in the tree that has not yet been visited. The 010418 ** pCur->aiIdx[pCur->iPage] value is set to the index of the parent cell 010419 ** of the page, or to the number of cells in the page if the next page 010420 ** to visit is the right-child of its parent. 010421 ** 010422 ** If all pages in the tree have been visited, return SQLITE_OK to the 010423 ** caller. 010424 */ 010425 if( pPage->leaf ){ 010426 do { 010427 if( pCur->iPage==0 ){ 010428 /* All pages of the b-tree have been visited. Return successfully. */ 010429 *pnEntry = nEntry; 010430 return moveToRoot(pCur); 010431 } 010432 moveToParent(pCur); 010433 }while ( pCur->ix>=pCur->pPage->nCell ); 010434 010435 pCur->ix++; 010436 pPage = pCur->pPage; 010437 } 010438 010439 /* Descend to the child node of the cell that the cursor currently 010440 ** points at. This is the right-child if (iIdx==pPage->nCell). 010441 */ 010442 iIdx = pCur->ix; 010443 if( iIdx==pPage->nCell ){ 010444 rc = moveToChild(pCur, get4byte(&pPage->aData[pPage->hdrOffset+8])); 010445 }else{ 010446 rc = moveToChild(pCur, get4byte(findCell(pPage, iIdx))); 010447 } 010448 } 010449 010450 /* An error has occurred. Return an error code. */ 010451 return rc; 010452 } 010453 010454 /* 010455 ** Return the pager associated with a BTree. This routine is used for 010456 ** testing and debugging only. 
010457 */ 010458 Pager *sqlite3BtreePager(Btree *p){ 010459 return p->pBt->pPager; 010460 } 010461 010462 #ifndef SQLITE_OMIT_INTEGRITY_CHECK 010463 /* 010464 ** Record an OOM error during integrity_check 010465 */ 010466 static void checkOom(IntegrityCk *pCheck){ 010467 pCheck->rc = SQLITE_NOMEM; 010468 pCheck->mxErr = 0; /* Causes integrity_check processing to stop */ 010469 if( pCheck->nErr==0 ) pCheck->nErr++; 010470 } 010471 010472 /* 010473 ** Invoke the progress handler, if appropriate. Also check for an 010474 ** interrupt. 010475 */ 010476 static void checkProgress(IntegrityCk *pCheck){ 010477 sqlite3 *db = pCheck->db; 010478 if( AtomicLoad(&db->u1.isInterrupted) ){ 010479 pCheck->rc = SQLITE_INTERRUPT; 010480 pCheck->nErr++; 010481 pCheck->mxErr = 0; 010482 } 010483 #ifndef SQLITE_OMIT_PROGRESS_CALLBACK 010484 if( db->xProgress ){ 010485 assert( db->nProgressOps>0 ); 010486 pCheck->nStep++; 010487 if( (pCheck->nStep % db->nProgressOps)==0 010488 && db->xProgress(db->pProgressArg) 010489 ){ 010490 pCheck->rc = SQLITE_INTERRUPT; 010491 pCheck->nErr++; 010492 pCheck->mxErr = 0; 010493 } 010494 } 010495 #endif 010496 } 010497 010498 /* 010499 ** Append a message to the error message string. 010500 */ 010501 static void checkAppendMsg( 010502 IntegrityCk *pCheck, 010503 const char *zFormat, 010504 ... 
010505 ){ 010506 va_list ap; 010507 checkProgress(pCheck); 010508 if( !pCheck->mxErr ) return; 010509 pCheck->mxErr--; 010510 pCheck->nErr++; 010511 va_start(ap, zFormat); 010512 if( pCheck->errMsg.nChar ){ 010513 sqlite3_str_append(&pCheck->errMsg, "\n", 1); 010514 } 010515 if( pCheck->zPfx ){ 010516 sqlite3_str_appendf(&pCheck->errMsg, pCheck->zPfx, 010517 pCheck->v0, pCheck->v1, pCheck->v2); 010518 } 010519 sqlite3_str_vappendf(&pCheck->errMsg, zFormat, ap); 010520 va_end(ap); 010521 if( pCheck->errMsg.accError==SQLITE_NOMEM ){ 010522 checkOom(pCheck); 010523 } 010524 } 010525 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */ 010526 010527 #ifndef SQLITE_OMIT_INTEGRITY_CHECK 010528 010529 /* 010530 ** Return non-zero if the bit in the IntegrityCk.aPgRef[] array that 010531 ** corresponds to page iPg is already set. 010532 */ 010533 static int getPageReferenced(IntegrityCk *pCheck, Pgno iPg){ 010534 assert( pCheck->aPgRef!=0 ); 010535 assert( iPg<=pCheck->nCkPage && sizeof(pCheck->aPgRef[0])==1 ); 010536 return (pCheck->aPgRef[iPg/8] & (1 << (iPg & 0x07))); 010537 } 010538 010539 /* 010540 ** Set the bit in the IntegrityCk.aPgRef[] array that corresponds to page iPg. 010541 */ 010542 static void setPageReferenced(IntegrityCk *pCheck, Pgno iPg){ 010543 assert( pCheck->aPgRef!=0 ); 010544 assert( iPg<=pCheck->nCkPage && sizeof(pCheck->aPgRef[0])==1 ); 010545 pCheck->aPgRef[iPg/8] |= (1 << (iPg & 0x07)); 010546 } 010547 010548 010549 /* 010550 ** Add 1 to the reference count for page iPage. If this is the second 010551 ** reference to the page, add an error message to pCheck->zErrMsg. 010552 ** Return 1 if there are 2 or more references to the page and 0 if 010553 ** if this is the first reference to the page. 010554 ** 010555 ** Also check that the page number is in bounds. 
010556 */ 010557 static int checkRef(IntegrityCk *pCheck, Pgno iPage){ 010558 if( iPage>pCheck->nCkPage || iPage==0 ){ 010559 checkAppendMsg(pCheck, "invalid page number %u", iPage); 010560 return 1; 010561 } 010562 if( getPageReferenced(pCheck, iPage) ){ 010563 checkAppendMsg(pCheck, "2nd reference to page %u", iPage); 010564 return 1; 010565 } 010566 setPageReferenced(pCheck, iPage); 010567 return 0; 010568 } 010569 010570 #ifndef SQLITE_OMIT_AUTOVACUUM 010571 /* 010572 ** Check that the entry in the pointer-map for page iChild maps to 010573 ** page iParent, pointer type ptrType. If not, append an error message 010574 ** to pCheck. 010575 */ 010576 static void checkPtrmap( 010577 IntegrityCk *pCheck, /* Integrity check context */ 010578 Pgno iChild, /* Child page number */ 010579 u8 eType, /* Expected pointer map type */ 010580 Pgno iParent /* Expected pointer map parent page number */ 010581 ){ 010582 int rc; 010583 u8 ePtrmapType; 010584 Pgno iPtrmapParent; 010585 010586 rc = ptrmapGet(pCheck->pBt, iChild, &ePtrmapType, &iPtrmapParent); 010587 if( rc!=SQLITE_OK ){ 010588 if( rc==SQLITE_NOMEM || rc==SQLITE_IOERR_NOMEM ) checkOom(pCheck); 010589 checkAppendMsg(pCheck, "Failed to read ptrmap key=%u", iChild); 010590 return; 010591 } 010592 010593 if( ePtrmapType!=eType || iPtrmapParent!=iParent ){ 010594 checkAppendMsg(pCheck, 010595 "Bad ptr map entry key=%u expected=(%u,%u) got=(%u,%u)", 010596 iChild, eType, iParent, ePtrmapType, iPtrmapParent); 010597 } 010598 } 010599 #endif 010600 010601 /* 010602 ** Check the integrity of the freelist or of an overflow page list. 010603 ** Verify that the number of pages on the list is N. 010604 */ 010605 static void checkList( 010606 IntegrityCk *pCheck, /* Integrity checking context */ 010607 int isFreeList, /* True for a freelist. 
False for overflow page list */ 010608 Pgno iPage, /* Page number for first page in the list */ 010609 u32 N /* Expected number of pages in the list */ 010610 ){ 010611 int i; 010612 u32 expected = N; 010613 int nErrAtStart = pCheck->nErr; 010614 while( iPage!=0 && pCheck->mxErr ){ 010615 DbPage *pOvflPage; 010616 unsigned char *pOvflData; 010617 if( checkRef(pCheck, iPage) ) break; 010618 N--; 010619 if( sqlite3PagerGet(pCheck->pPager, (Pgno)iPage, &pOvflPage, 0) ){ 010620 checkAppendMsg(pCheck, "failed to get page %u", iPage); 010621 break; 010622 } 010623 pOvflData = (unsigned char *)sqlite3PagerGetData(pOvflPage); 010624 if( isFreeList ){ 010625 u32 n = (u32)get4byte(&pOvflData[4]); 010626 #ifndef SQLITE_OMIT_AUTOVACUUM 010627 if( pCheck->pBt->autoVacuum ){ 010628 checkPtrmap(pCheck, iPage, PTRMAP_FREEPAGE, 0); 010629 } 010630 #endif 010631 if( n>pCheck->pBt->usableSize/4-2 ){ 010632 checkAppendMsg(pCheck, 010633 "freelist leaf count too big on page %u", iPage); 010634 N--; 010635 }else{ 010636 for(i=0; i<(int)n; i++){ 010637 Pgno iFreePage = get4byte(&pOvflData[8+i*4]); 010638 #ifndef SQLITE_OMIT_AUTOVACUUM 010639 if( pCheck->pBt->autoVacuum ){ 010640 checkPtrmap(pCheck, iFreePage, PTRMAP_FREEPAGE, 0); 010641 } 010642 #endif 010643 checkRef(pCheck, iFreePage); 010644 } 010645 N -= n; 010646 } 010647 } 010648 #ifndef SQLITE_OMIT_AUTOVACUUM 010649 else{ 010650 /* If this database supports auto-vacuum and iPage is not the last 010651 ** page in this overflow list, check that the pointer-map entry for 010652 ** the following page matches iPage. 010653 */ 010654 if( pCheck->pBt->autoVacuum && N>0 ){ 010655 i = get4byte(pOvflData); 010656 checkPtrmap(pCheck, i, PTRMAP_OVERFLOW2, iPage); 010657 } 010658 } 010659 #endif 010660 iPage = get4byte(pOvflData); 010661 sqlite3PagerUnref(pOvflPage); 010662 } 010663 if( N && nErrAtStart==pCheck->nErr ){ 010664 checkAppendMsg(pCheck, 010665 "%s is %u but should be %u", 010666 isFreeList ? 
"size" : "overflow list length", 010667 expected-N, expected); 010668 } 010669 } 010670 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */ 010671 010672 /* 010673 ** An implementation of a min-heap. 010674 ** 010675 ** aHeap[0] is the number of elements on the heap. aHeap[1] is the 010676 ** root element. The daughter nodes of aHeap[N] are aHeap[N*2] 010677 ** and aHeap[N*2+1]. 010678 ** 010679 ** The heap property is this: Every node is less than or equal to both 010680 ** of its daughter nodes. A consequence of the heap property is that the 010681 ** root node aHeap[1] is always the minimum value currently in the heap. 010682 ** 010683 ** The btreeHeapInsert() routine inserts an unsigned 32-bit number onto 010684 ** the heap, preserving the heap property. The btreeHeapPull() routine 010685 ** removes the root element from the heap (the minimum value in the heap) 010686 ** and then moves other nodes around as necessary to preserve the heap 010687 ** property. 010688 ** 010689 ** This heap is used for cell overlap and coverage testing. Each u32 010690 ** entry represents the span of a cell or freeblock on a btree page. 010691 ** The upper 16 bits are the index of the first byte of a range and the 010692 ** lower 16 bits are the index of the last byte of that range. 
010693 */ 010694 static void btreeHeapInsert(u32 *aHeap, u32 x){ 010695 u32 j, i; 010696 assert( aHeap!=0 ); 010697 i = ++aHeap[0]; 010698 aHeap[i] = x; 010699 while( (j = i/2)>0 && aHeap[j]>aHeap[i] ){ 010700 x = aHeap[j]; 010701 aHeap[j] = aHeap[i]; 010702 aHeap[i] = x; 010703 i = j; 010704 } 010705 } 010706 static int btreeHeapPull(u32 *aHeap, u32 *pOut){ 010707 u32 j, i, x; 010708 if( (x = aHeap[0])==0 ) return 0; 010709 *pOut = aHeap[1]; 010710 aHeap[1] = aHeap[x]; 010711 aHeap[x] = 0xffffffff; 010712 aHeap[0]--; 010713 i = 1; 010714 while( (j = i*2)<=aHeap[0] ){ 010715 if( aHeap[j]>aHeap[j+1] ) j++; 010716 if( aHeap[i]<aHeap[j] ) break; 010717 x = aHeap[i]; 010718 aHeap[i] = aHeap[j]; 010719 aHeap[j] = x; 010720 i = j; 010721 } 010722 return 1; 010723 } 010724 010725 #ifndef SQLITE_OMIT_INTEGRITY_CHECK 010726 /* 010727 ** Do various sanity checks on a single page of a tree. Return 010728 ** the tree depth. Root pages return 0. Parents of root pages 010729 ** return 1, and so forth. 010730 ** 010731 ** These checks are done: 010732 ** 010733 ** 1. Make sure that cells and freeblocks do not overlap 010734 ** but combine to completely cover the page. 010735 ** 2. Make sure integer cell keys are in order. 010736 ** 3. Check the integrity of overflow pages. 010737 ** 4. Recursively call checkTreePage on all children. 010738 ** 5. Verify that the depth of all children is the same. 
010739 */ 010740 static int checkTreePage( 010741 IntegrityCk *pCheck, /* Context for the sanity check */ 010742 Pgno iPage, /* Page number of the page to check */ 010743 i64 *piMinKey, /* Write minimum integer primary key here */ 010744 i64 maxKey /* Error if integer primary key greater than this */ 010745 ){ 010746 MemPage *pPage = 0; /* The page being analyzed */ 010747 int i; /* Loop counter */ 010748 int rc; /* Result code from subroutine call */ 010749 int depth = -1, d2; /* Depth of a subtree */ 010750 int pgno; /* Page number */ 010751 int nFrag; /* Number of fragmented bytes on the page */ 010752 int hdr; /* Offset to the page header */ 010753 int cellStart; /* Offset to the start of the cell pointer array */ 010754 int nCell; /* Number of cells */ 010755 int doCoverageCheck = 1; /* True if cell coverage checking should be done */ 010756 int keyCanBeEqual = 1; /* True if IPK can be equal to maxKey 010757 ** False if IPK must be strictly less than maxKey */ 010758 u8 *data; /* Page content */ 010759 u8 *pCell; /* Cell content */ 010760 u8 *pCellIdx; /* Next element of the cell pointer array */ 010761 BtShared *pBt; /* The BtShared object that owns pPage */ 010762 u32 pc; /* Address of a cell */ 010763 u32 usableSize; /* Usable size of the page */ 010764 u32 contentOffset; /* Offset to the start of the cell content area */ 010765 u32 *heap = 0; /* Min-heap used for checking cell coverage */ 010766 u32 x, prev = 0; /* Next and previous entry on the min-heap */ 010767 const char *saved_zPfx = pCheck->zPfx; 010768 int saved_v1 = pCheck->v1; 010769 int saved_v2 = pCheck->v2; 010770 u8 savedIsInit = 0; 010771 010772 /* Check that the page exists 010773 */ 010774 checkProgress(pCheck); 010775 if( pCheck->mxErr==0 ) goto end_of_check; 010776 pBt = pCheck->pBt; 010777 usableSize = pBt->usableSize; 010778 if( iPage==0 ) return 0; 010779 if( checkRef(pCheck, iPage) ) return 0; 010780 pCheck->zPfx = "Tree %u page %u: "; 010781 pCheck->v1 = iPage; 010782 if( (rc = 
btreeGetPage(pBt, iPage, &pPage, 0))!=0 ){ 010783 checkAppendMsg(pCheck, 010784 "unable to get the page. error code=%d", rc); 010785 if( rc==SQLITE_IOERR_NOMEM ) pCheck->rc = SQLITE_NOMEM; 010786 goto end_of_check; 010787 } 010788 010789 /* Clear MemPage.isInit to make sure the corruption detection code in 010790 ** btreeInitPage() is executed. */ 010791 savedIsInit = pPage->isInit; 010792 pPage->isInit = 0; 010793 if( (rc = btreeInitPage(pPage))!=0 ){ 010794 assert( rc==SQLITE_CORRUPT ); /* The only possible error from InitPage */ 010795 checkAppendMsg(pCheck, 010796 "btreeInitPage() returns error code %d", rc); 010797 goto end_of_check; 010798 } 010799 if( (rc = btreeComputeFreeSpace(pPage))!=0 ){ 010800 assert( rc==SQLITE_CORRUPT ); 010801 checkAppendMsg(pCheck, "free space corruption", rc); 010802 goto end_of_check; 010803 } 010804 data = pPage->aData; 010805 hdr = pPage->hdrOffset; 010806 010807 /* Set up for cell analysis */ 010808 pCheck->zPfx = "Tree %u page %u cell %u: "; 010809 contentOffset = get2byteNotZero(&data[hdr+5]); 010810 assert( contentOffset<=usableSize ); /* Enforced by btreeInitPage() */ 010811 010812 /* EVIDENCE-OF: R-37002-32774 The two-byte integer at offset 3 gives the 010813 ** number of cells on the page. */ 010814 nCell = get2byte(&data[hdr+3]); 010815 assert( pPage->nCell==nCell ); 010816 if( pPage->leaf || pPage->intKey==0 ){ 010817 pCheck->nRow += nCell; 010818 } 010819 010820 /* EVIDENCE-OF: R-23882-45353 The cell pointer array of a b-tree page 010821 ** immediately follows the b-tree page header. 
*/ 010822 cellStart = hdr + 12 - 4*pPage->leaf; 010823 assert( pPage->aCellIdx==&data[cellStart] ); 010824 pCellIdx = &data[cellStart + 2*(nCell-1)]; 010825 010826 if( !pPage->leaf ){ 010827 /* Analyze the right-child page of internal pages */ 010828 pgno = get4byte(&data[hdr+8]); 010829 #ifndef SQLITE_OMIT_AUTOVACUUM 010830 if( pBt->autoVacuum ){ 010831 pCheck->zPfx = "Tree %u page %u right child: "; 010832 checkPtrmap(pCheck, pgno, PTRMAP_BTREE, iPage); 010833 } 010834 #endif 010835 depth = checkTreePage(pCheck, pgno, &maxKey, maxKey); 010836 keyCanBeEqual = 0; 010837 }else{ 010838 /* For leaf pages, the coverage check will occur in the same loop 010839 ** as the other cell checks, so initialize the heap. */ 010840 heap = pCheck->heap; 010841 heap[0] = 0; 010842 } 010843 010844 /* EVIDENCE-OF: R-02776-14802 The cell pointer array consists of K 2-byte 010845 ** integer offsets to the cell contents. */ 010846 for(i=nCell-1; i>=0 && pCheck->mxErr; i--){ 010847 CellInfo info; 010848 010849 /* Check cell size */ 010850 pCheck->v2 = i; 010851 assert( pCellIdx==&data[cellStart + i*2] ); 010852 pc = get2byteAligned(pCellIdx); 010853 pCellIdx -= 2; 010854 if( pc<contentOffset || pc>usableSize-4 ){ 010855 checkAppendMsg(pCheck, "Offset %u out of range %u..%u", 010856 pc, contentOffset, usableSize-4); 010857 doCoverageCheck = 0; 010858 continue; 010859 } 010860 pCell = &data[pc]; 010861 pPage->xParseCell(pPage, pCell, &info); 010862 if( pc+info.nSize>usableSize ){ 010863 checkAppendMsg(pCheck, "Extends off end of page"); 010864 doCoverageCheck = 0; 010865 continue; 010866 } 010867 010868 /* Check for integer primary key out of range */ 010869 if( pPage->intKey ){ 010870 if( keyCanBeEqual ? 
(info.nKey > maxKey) : (info.nKey >= maxKey) ){ 010871 checkAppendMsg(pCheck, "Rowid %lld out of order", info.nKey); 010872 } 010873 maxKey = info.nKey; 010874 keyCanBeEqual = 0; /* Only the first key on the page may ==maxKey */ 010875 } 010876 010877 /* Check the content overflow list */ 010878 if( info.nPayload>info.nLocal ){ 010879 u32 nPage; /* Number of pages on the overflow chain */ 010880 Pgno pgnoOvfl; /* First page of the overflow chain */ 010881 assert( pc + info.nSize - 4 <= usableSize ); 010882 nPage = (info.nPayload - info.nLocal + usableSize - 5)/(usableSize - 4); 010883 pgnoOvfl = get4byte(&pCell[info.nSize - 4]); 010884 #ifndef SQLITE_OMIT_AUTOVACUUM 010885 if( pBt->autoVacuum ){ 010886 checkPtrmap(pCheck, pgnoOvfl, PTRMAP_OVERFLOW1, iPage); 010887 } 010888 #endif 010889 checkList(pCheck, 0, pgnoOvfl, nPage); 010890 } 010891 010892 if( !pPage->leaf ){ 010893 /* Check sanity of left child page for internal pages */ 010894 pgno = get4byte(pCell); 010895 #ifndef SQLITE_OMIT_AUTOVACUUM 010896 if( pBt->autoVacuum ){ 010897 checkPtrmap(pCheck, pgno, PTRMAP_BTREE, iPage); 010898 } 010899 #endif 010900 d2 = checkTreePage(pCheck, pgno, &maxKey, maxKey); 010901 keyCanBeEqual = 0; 010902 if( d2!=depth ){ 010903 checkAppendMsg(pCheck, "Child page depth differs"); 010904 depth = d2; 010905 } 010906 }else{ 010907 /* Populate the coverage-checking heap for leaf pages */ 010908 btreeHeapInsert(heap, (pc<<16)|(pc+info.nSize-1)); 010909 } 010910 } 010911 *piMinKey = maxKey; 010912 010913 /* Check for complete coverage of the page 010914 */ 010915 pCheck->zPfx = 0; 010916 if( doCoverageCheck && pCheck->mxErr>0 ){ 010917 /* For leaf pages, the min-heap has already been initialized and the 010918 ** cells have already been inserted. 
But for internal pages, that has 010919 ** not yet been done, so do it now */ 010920 if( !pPage->leaf ){ 010921 heap = pCheck->heap; 010922 heap[0] = 0; 010923 for(i=nCell-1; i>=0; i--){ 010924 u32 size; 010925 pc = get2byteAligned(&data[cellStart+i*2]); 010926 size = pPage->xCellSize(pPage, &data[pc]); 010927 btreeHeapInsert(heap, (pc<<16)|(pc+size-1)); 010928 } 010929 } 010930 assert( heap!=0 ); 010931 /* Add the freeblocks to the min-heap 010932 ** 010933 ** EVIDENCE-OF: R-20690-50594 The second field of the b-tree page header 010934 ** is the offset of the first freeblock, or zero if there are no 010935 ** freeblocks on the page. 010936 */ 010937 i = get2byte(&data[hdr+1]); 010938 while( i>0 ){ 010939 int size, j; 010940 assert( (u32)i<=usableSize-4 ); /* Enforced by btreeComputeFreeSpace() */ 010941 size = get2byte(&data[i+2]); 010942 assert( (u32)(i+size)<=usableSize ); /* due to btreeComputeFreeSpace() */ 010943 btreeHeapInsert(heap, (((u32)i)<<16)|(i+size-1)); 010944 /* EVIDENCE-OF: R-58208-19414 The first 2 bytes of a freeblock are a 010945 ** big-endian integer which is the offset in the b-tree page of the next 010946 ** freeblock in the chain, or zero if the freeblock is the last on the 010947 ** chain. */ 010948 j = get2byte(&data[i]); 010949 /* EVIDENCE-OF: R-06866-39125 Freeblocks are always connected in order of 010950 ** increasing offset. */ 010951 assert( j==0 || j>i+size ); /* Enforced by btreeComputeFreeSpace() */ 010952 assert( (u32)j<=usableSize-4 ); /* Enforced by btreeComputeFreeSpace() */ 010953 i = j; 010954 } 010955 /* Analyze the min-heap looking for overlap between cells and/or 010956 ** freeblocks, and counting the number of untracked bytes in nFrag. 010957 ** 010958 ** Each min-heap entry is of the form: (start_address<<16)|end_address. 010959 ** There is an implied first entry the covers the page header, the cell 010960 ** pointer index, and the gap between the cell pointer index and the start 010961 ** of cell content. 
010962 ** 010963 ** The loop below pulls entries from the min-heap in order and compares 010964 ** the start_address against the previous end_address. If there is an 010965 ** overlap, that means bytes are used multiple times. If there is a gap, 010966 ** that gap is added to the fragmentation count. 010967 */ 010968 nFrag = 0; 010969 prev = contentOffset - 1; /* Implied first min-heap entry */ 010970 while( btreeHeapPull(heap,&x) ){ 010971 if( (prev&0xffff)>=(x>>16) ){ 010972 checkAppendMsg(pCheck, 010973 "Multiple uses for byte %u of page %u", x>>16, iPage); 010974 break; 010975 }else{ 010976 nFrag += (x>>16) - (prev&0xffff) - 1; 010977 prev = x; 010978 } 010979 } 010980 nFrag += usableSize - (prev&0xffff) - 1; 010981 /* EVIDENCE-OF: R-43263-13491 The total number of bytes in all fragments 010982 ** is stored in the fifth field of the b-tree page header. 010983 ** EVIDENCE-OF: R-07161-27322 The one-byte integer at offset 7 gives the 010984 ** number of fragmented free bytes within the cell content area. 010985 */ 010986 if( heap[0]==0 && nFrag!=data[hdr+7] ){ 010987 checkAppendMsg(pCheck, 010988 "Fragmentation of %u bytes reported as %u on page %u", 010989 nFrag, data[hdr+7], iPage); 010990 } 010991 } 010992 010993 end_of_check: 010994 if( !doCoverageCheck ) pPage->isInit = savedIsInit; 010995 releasePage(pPage); 010996 pCheck->zPfx = saved_zPfx; 010997 pCheck->v1 = saved_v1; 010998 pCheck->v2 = saved_v2; 010999 return depth+1; 011000 } 011001 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */ 011002 011003 #ifndef SQLITE_OMIT_INTEGRITY_CHECK 011004 /* 011005 ** This routine does a complete check of the given BTree file. aRoot[] is 011006 ** an array of pages numbers were each page number is the root page of 011007 ** a table. nRoot is the number of entries in aRoot. 011008 ** 011009 ** A read-only or read-write transaction must be opened before calling 011010 ** this function. 011011 ** 011012 ** Write the number of error seen in *pnErr. 
Except for some memory 011013 ** allocation errors, an error message held in memory obtained from 011014 ** malloc is returned if *pnErr is non-zero. If *pnErr==0 then NULL is 011015 ** returned. If a memory allocation error occurs, NULL is returned. 011016 ** 011017 ** If the first entry in aRoot[] is 0, that indicates that the list of 011018 ** root pages is incomplete. This is a "partial integrity-check". This 011019 ** happens when performing an integrity check on a single table. The 011020 ** zero is skipped, of course. But in addition, the freelist checks 011021 ** and the checks to make sure every page is referenced are also skipped, 011022 ** since obviously it is not possible to know which pages are covered by 011023 ** the unverified btrees. Except, if aRoot[1] is 1, then the freelist 011024 ** checks are still performed. 011025 */ 011026 int sqlite3BtreeIntegrityCheck( 011027 sqlite3 *db, /* Database connection that is running the check */ 011028 Btree *p, /* The btree to be checked */ 011029 Pgno *aRoot, /* An array of root pages numbers for individual trees */ 011030 Mem *aCnt, /* Memory cells to write counts for each tree to */ 011031 int nRoot, /* Number of entries in aRoot[] */ 011032 int mxErr, /* Stop reporting errors after this many */ 011033 int *pnErr, /* OUT: Write number of errors seen to this variable */ 011034 char **pzOut /* OUT: Write the error message string here */ 011035 ){ 011036 Pgno i; 011037 IntegrityCk sCheck; 011038 BtShared *pBt = p->pBt; 011039 u64 savedDbFlags = pBt->db->flags; 011040 char zErr[100]; 011041 int bPartial = 0; /* True if not checking all btrees */ 011042 int bCkFreelist = 1; /* True to scan the freelist */ 011043 VVA_ONLY( int nRef ); 011044 011045 assert( nRoot>0 ); 011046 assert( aCnt!=0 ); 011047 011048 /* aRoot[0]==0 means this is a partial check */ 011049 if( aRoot[0]==0 ){ 011050 assert( nRoot>1 ); 011051 bPartial = 1; 011052 if( aRoot[1]!=1 ) bCkFreelist = 0; 011053 } 011054 011055 sqlite3BtreeEnter(p); 
011056 assert( p->inTrans>TRANS_NONE && pBt->inTransaction>TRANS_NONE ); 011057 VVA_ONLY( nRef = sqlite3PagerRefcount(pBt->pPager) ); 011058 assert( nRef>=0 ); 011059 memset(&sCheck, 0, sizeof(sCheck)); 011060 sCheck.db = db; 011061 sCheck.pBt = pBt; 011062 sCheck.pPager = pBt->pPager; 011063 sCheck.nCkPage = btreePagecount(sCheck.pBt); 011064 sCheck.mxErr = mxErr; 011065 sqlite3StrAccumInit(&sCheck.errMsg, 0, zErr, sizeof(zErr), SQLITE_MAX_LENGTH); 011066 sCheck.errMsg.printfFlags = SQLITE_PRINTF_INTERNAL; 011067 if( sCheck.nCkPage==0 ){ 011068 goto integrity_ck_cleanup; 011069 } 011070 011071 sCheck.aPgRef = sqlite3MallocZero((sCheck.nCkPage / 8)+ 1); 011072 if( !sCheck.aPgRef ){ 011073 checkOom(&sCheck); 011074 goto integrity_ck_cleanup; 011075 } 011076 sCheck.heap = (u32*)sqlite3PageMalloc( pBt->pageSize ); 011077 if( sCheck.heap==0 ){ 011078 checkOom(&sCheck); 011079 goto integrity_ck_cleanup; 011080 } 011081 011082 i = PENDING_BYTE_PAGE(pBt); 011083 if( i<=sCheck.nCkPage ) setPageReferenced(&sCheck, i); 011084 011085 /* Check the integrity of the freelist 011086 */ 011087 if( bCkFreelist ){ 011088 sCheck.zPfx = "Freelist: "; 011089 checkList(&sCheck, 1, get4byte(&pBt->pPage1->aData[32]), 011090 get4byte(&pBt->pPage1->aData[36])); 011091 sCheck.zPfx = 0; 011092 } 011093 011094 /* Check all the tables. 
011095 */ 011096 #ifndef SQLITE_OMIT_AUTOVACUUM 011097 if( !bPartial ){ 011098 if( pBt->autoVacuum ){ 011099 Pgno mx = 0; 011100 Pgno mxInHdr; 011101 for(i=0; (int)i<nRoot; i++) if( mx<aRoot[i] ) mx = aRoot[i]; 011102 mxInHdr = get4byte(&pBt->pPage1->aData[52]); 011103 if( mx!=mxInHdr ){ 011104 checkAppendMsg(&sCheck, 011105 "max rootpage (%u) disagrees with header (%u)", 011106 mx, mxInHdr 011107 ); 011108 } 011109 }else if( get4byte(&pBt->pPage1->aData[64])!=0 ){ 011110 checkAppendMsg(&sCheck, 011111 "incremental_vacuum enabled with a max rootpage of zero" 011112 ); 011113 } 011114 } 011115 #endif 011116 testcase( pBt->db->flags & SQLITE_CellSizeCk ); 011117 pBt->db->flags &= ~(u64)SQLITE_CellSizeCk; 011118 for(i=0; (int)i<nRoot && sCheck.mxErr; i++){ 011119 sCheck.nRow = 0; 011120 if( aRoot[i] ){ 011121 i64 notUsed; 011122 #ifndef SQLITE_OMIT_AUTOVACUUM 011123 if( pBt->autoVacuum && aRoot[i]>1 && !bPartial ){ 011124 checkPtrmap(&sCheck, aRoot[i], PTRMAP_ROOTPAGE, 0); 011125 } 011126 #endif 011127 sCheck.v0 = aRoot[i]; 011128 checkTreePage(&sCheck, aRoot[i], ¬Used, LARGEST_INT64); 011129 } 011130 sqlite3MemSetArrayInt64(aCnt, i, sCheck.nRow); 011131 } 011132 pBt->db->flags = savedDbFlags; 011133 011134 /* Make sure every page in the file is referenced 011135 */ 011136 if( !bPartial ){ 011137 for(i=1; i<=sCheck.nCkPage && sCheck.mxErr; i++){ 011138 #ifdef SQLITE_OMIT_AUTOVACUUM 011139 if( getPageReferenced(&sCheck, i)==0 ){ 011140 checkAppendMsg(&sCheck, "Page %u: never used", i); 011141 } 011142 #else 011143 /* If the database supports auto-vacuum, make sure no tables contain 011144 ** references to pointer-map pages. 
011145 */ 011146 if( getPageReferenced(&sCheck, i)==0 && 011147 (PTRMAP_PAGENO(pBt, i)!=i || !pBt->autoVacuum) ){ 011148 checkAppendMsg(&sCheck, "Page %u: never used", i); 011149 } 011150 if( getPageReferenced(&sCheck, i)!=0 && 011151 (PTRMAP_PAGENO(pBt, i)==i && pBt->autoVacuum) ){ 011152 checkAppendMsg(&sCheck, "Page %u: pointer map referenced", i); 011153 } 011154 #endif 011155 } 011156 } 011157 011158 /* Clean up and report errors. 011159 */ 011160 integrity_ck_cleanup: 011161 sqlite3PageFree(sCheck.heap); 011162 sqlite3_free(sCheck.aPgRef); 011163 *pnErr = sCheck.nErr; 011164 if( sCheck.nErr==0 ){ 011165 sqlite3_str_reset(&sCheck.errMsg); 011166 *pzOut = 0; 011167 }else{ 011168 *pzOut = sqlite3StrAccumFinish(&sCheck.errMsg); 011169 } 011170 /* Make sure this analysis did not leave any unref() pages. */ 011171 assert( nRef==sqlite3PagerRefcount(pBt->pPager) ); 011172 sqlite3BtreeLeave(p); 011173 return sCheck.rc; 011174 } 011175 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */ 011176 011177 /* 011178 ** Return the full pathname of the underlying database file. Return 011179 ** an empty string if the database is in-memory or a TEMP database. 011180 ** 011181 ** The pager filename is invariant as long as the pager is 011182 ** open so it is safe to access without the BtShared mutex. 011183 */ 011184 const char *sqlite3BtreeGetFilename(Btree *p){ 011185 assert( p->pBt->pPager!=0 ); 011186 return sqlite3PagerFilename(p->pBt->pPager, 1); 011187 } 011188 011189 /* 011190 ** Return the pathname of the journal file for this database. The return 011191 ** value of this routine is the same regardless of whether the journal file 011192 ** has been created or not. 011193 ** 011194 ** The pager journal filename is invariant as long as the pager is 011195 ** open so it is safe to access without the BtShared mutex. 
011196 */ 011197 const char *sqlite3BtreeGetJournalname(Btree *p){ 011198 assert( p->pBt->pPager!=0 ); 011199 return sqlite3PagerJournalname(p->pBt->pPager); 011200 } 011201 011202 /* 011203 ** Return one of SQLITE_TXN_NONE, SQLITE_TXN_READ, or SQLITE_TXN_WRITE 011204 ** to describe the current transaction state of Btree p. 011205 */ 011206 int sqlite3BtreeTxnState(Btree *p){ 011207 assert( p==0 || sqlite3_mutex_held(p->db->mutex) ); 011208 return p ? p->inTrans : 0; 011209 } 011210 011211 #ifndef SQLITE_OMIT_WAL 011212 /* 011213 ** Run a checkpoint on the Btree passed as the first argument. 011214 ** 011215 ** Return SQLITE_LOCKED if this or any other connection has an open 011216 ** transaction on the shared-cache the argument Btree is connected to. 011217 ** 011218 ** Parameter eMode is one of SQLITE_CHECKPOINT_PASSIVE, FULL or RESTART. 011219 */ 011220 int sqlite3BtreeCheckpoint(Btree *p, int eMode, int *pnLog, int *pnCkpt){ 011221 int rc = SQLITE_OK; 011222 if( p ){ 011223 BtShared *pBt = p->pBt; 011224 sqlite3BtreeEnter(p); 011225 if( pBt->inTransaction!=TRANS_NONE ){ 011226 rc = SQLITE_LOCKED; 011227 }else{ 011228 rc = sqlite3PagerCheckpoint(pBt->pPager, p->db, eMode, pnLog, pnCkpt); 011229 } 011230 sqlite3BtreeLeave(p); 011231 } 011232 return rc; 011233 } 011234 #endif 011235 011236 /* 011237 ** Return true if there is currently a backup running on Btree p. 011238 */ 011239 int sqlite3BtreeIsInBackup(Btree *p){ 011240 assert( p ); 011241 assert( sqlite3_mutex_held(p->db->mutex) ); 011242 return p->nBackup!=0; 011243 } 011244 011245 /* 011246 ** This function returns a pointer to a blob of memory associated with 011247 ** a single shared-btree. The memory is used by client code for its own 011248 ** purposes (for example, to store a high-level schema associated with 011249 ** the shared-btree). The btree layer manages reference counting issues. 
011250 ** 011251 ** The first time this is called on a shared-btree, nBytes bytes of memory 011252 ** are allocated, zeroed, and returned to the caller. For each subsequent 011253 ** call the nBytes parameter is ignored and a pointer to the same blob 011254 ** of memory returned. 011255 ** 011256 ** If the nBytes parameter is 0 and the blob of memory has not yet been 011257 ** allocated, a null pointer is returned. If the blob has already been 011258 ** allocated, it is returned as normal. 011259 ** 011260 ** Just before the shared-btree is closed, the function passed as the 011261 ** xFree argument when the memory allocation was made is invoked on the 011262 ** blob of allocated memory. The xFree function should not call sqlite3_free() 011263 ** on the memory, the btree layer does that. 011264 */ 011265 void *sqlite3BtreeSchema(Btree *p, int nBytes, void(*xFree)(void *)){ 011266 BtShared *pBt = p->pBt; 011267 sqlite3BtreeEnter(p); 011268 if( !pBt->pSchema && nBytes ){ 011269 pBt->pSchema = sqlite3DbMallocZero(0, nBytes); 011270 pBt->xFreeSchema = xFree; 011271 } 011272 sqlite3BtreeLeave(p); 011273 return pBt->pSchema; 011274 } 011275 011276 /* 011277 ** Return SQLITE_LOCKED_SHAREDCACHE if another user of the same shared 011278 ** btree as the argument handle holds an exclusive lock on the 011279 ** sqlite_schema table. Otherwise SQLITE_OK. 011280 */ 011281 int sqlite3BtreeSchemaLocked(Btree *p){ 011282 int rc; 011283 assert( sqlite3_mutex_held(p->db->mutex) ); 011284 sqlite3BtreeEnter(p); 011285 rc = querySharedCacheTableLock(p, SCHEMA_ROOT, READ_LOCK); 011286 assert( rc==SQLITE_OK || rc==SQLITE_LOCKED_SHAREDCACHE ); 011287 sqlite3BtreeLeave(p); 011288 return rc; 011289 } 011290 011291 011292 #ifndef SQLITE_OMIT_SHARED_CACHE 011293 /* 011294 ** Obtain a lock on the table whose root page is iTab. The 011295 ** lock is a write lock if isWritelock is true or a read lock 011296 ** if it is false. 
011297 */ 011298 int sqlite3BtreeLockTable(Btree *p, int iTab, u8 isWriteLock){ 011299 int rc = SQLITE_OK; 011300 assert( p->inTrans!=TRANS_NONE ); 011301 if( p->sharable ){ 011302 u8 lockType = READ_LOCK + isWriteLock; 011303 assert( READ_LOCK+1==WRITE_LOCK ); 011304 assert( isWriteLock==0 || isWriteLock==1 ); 011305 011306 sqlite3BtreeEnter(p); 011307 rc = querySharedCacheTableLock(p, iTab, lockType); 011308 if( rc==SQLITE_OK ){ 011309 rc = setSharedCacheTableLock(p, iTab, lockType); 011310 } 011311 sqlite3BtreeLeave(p); 011312 } 011313 return rc; 011314 } 011315 #endif 011316 011317 #ifndef SQLITE_OMIT_INCRBLOB 011318 /* 011319 ** Argument pCsr must be a cursor opened for writing on an 011320 ** INTKEY table currently pointing at a valid table entry. 011321 ** This function modifies the data stored as part of that entry. 011322 ** 011323 ** Only the data content may only be modified, it is not possible to 011324 ** change the length of the data stored. If this function is called with 011325 ** parameters that attempt to write past the end of the existing data, 011326 ** no modifications are made and SQLITE_CORRUPT is returned. 011327 */ 011328 int sqlite3BtreePutData(BtCursor *pCsr, u32 offset, u32 amt, void *z){ 011329 int rc; 011330 assert( cursorOwnsBtShared(pCsr) ); 011331 assert( sqlite3_mutex_held(pCsr->pBtree->db->mutex) ); 011332 assert( pCsr->curFlags & BTCF_Incrblob ); 011333 011334 rc = restoreCursorPosition(pCsr); 011335 if( rc!=SQLITE_OK ){ 011336 return rc; 011337 } 011338 assert( pCsr->eState!=CURSOR_REQUIRESEEK ); 011339 if( pCsr->eState!=CURSOR_VALID ){ 011340 return SQLITE_ABORT; 011341 } 011342 011343 /* Save the positions of all other cursors open on this table. This is 011344 ** required in case any of them are holding references to an xFetch 011345 ** version of the b-tree page modified by the accessPayload call below. 
011346 ** 011347 ** Note that pCsr must be open on a INTKEY table and saveCursorPosition() 011348 ** and hence saveAllCursors() cannot fail on a BTREE_INTKEY table, hence 011349 ** saveAllCursors can only return SQLITE_OK. 011350 */ 011351 VVA_ONLY(rc =) saveAllCursors(pCsr->pBt, pCsr->pgnoRoot, pCsr); 011352 assert( rc==SQLITE_OK ); 011353 011354 /* Check some assumptions: 011355 ** (a) the cursor is open for writing, 011356 ** (b) there is a read/write transaction open, 011357 ** (c) the connection holds a write-lock on the table (if required), 011358 ** (d) there are no conflicting read-locks, and 011359 ** (e) the cursor points at a valid row of an intKey table. 011360 */ 011361 if( (pCsr->curFlags & BTCF_WriteFlag)==0 ){ 011362 return SQLITE_READONLY; 011363 } 011364 assert( (pCsr->pBt->btsFlags & BTS_READ_ONLY)==0 011365 && pCsr->pBt->inTransaction==TRANS_WRITE ); 011366 assert( hasSharedCacheTableLock(pCsr->pBtree, pCsr->pgnoRoot, 0, 2) ); 011367 assert( !hasReadConflicts(pCsr->pBtree, pCsr->pgnoRoot) ); 011368 assert( pCsr->pPage->intKey ); 011369 011370 return accessPayload(pCsr, offset, amt, (unsigned char *)z, 1); 011371 } 011372 011373 /* 011374 ** Mark this cursor as an incremental blob cursor. 011375 */ 011376 void sqlite3BtreeIncrblobCursor(BtCursor *pCur){ 011377 pCur->curFlags |= BTCF_Incrblob; 011378 pCur->pBtree->hasIncrblobCur = 1; 011379 } 011380 #endif 011381 011382 /* 011383 ** Set both the "read version" (single byte at byte offset 18) and 011384 ** "write version" (single byte at byte offset 19) fields in the database 011385 ** header to iVersion. 011386 */ 011387 int sqlite3BtreeSetVersion(Btree *pBtree, int iVersion){ 011388 BtShared *pBt = pBtree->pBt; 011389 int rc; /* Return code */ 011390 011391 assert( iVersion==1 || iVersion==2 ); 011392 011393 /* If setting the version fields to 1, do not automatically open the 011394 ** WAL connection, even if the version fields are currently set to 2. 
011395 */ 011396 pBt->btsFlags &= ~BTS_NO_WAL; 011397 if( iVersion==1 ) pBt->btsFlags |= BTS_NO_WAL; 011398 011399 rc = sqlite3BtreeBeginTrans(pBtree, 0, 0); 011400 if( rc==SQLITE_OK ){ 011401 u8 *aData = pBt->pPage1->aData; 011402 if( aData[18]!=(u8)iVersion || aData[19]!=(u8)iVersion ){ 011403 rc = sqlite3BtreeBeginTrans(pBtree, 2, 0); 011404 if( rc==SQLITE_OK ){ 011405 rc = sqlite3PagerWrite(pBt->pPage1->pDbPage); 011406 if( rc==SQLITE_OK ){ 011407 aData[18] = (u8)iVersion; 011408 aData[19] = (u8)iVersion; 011409 } 011410 } 011411 } 011412 } 011413 011414 pBt->btsFlags &= ~BTS_NO_WAL; 011415 return rc; 011416 } 011417 011418 /* 011419 ** Return true if the cursor has a hint specified. This routine is 011420 ** only used from within assert() statements 011421 */ 011422 int sqlite3BtreeCursorHasHint(BtCursor *pCsr, unsigned int mask){ 011423 return (pCsr->hints & mask)!=0; 011424 } 011425 011426 /* 011427 ** Return true if the given Btree is read-only. 011428 */ 011429 int sqlite3BtreeIsReadonly(Btree *p){ 011430 return (p->pBt->btsFlags & BTS_READ_ONLY)!=0; 011431 } 011432 011433 /* 011434 ** Return the size of the header added to each page by this module. 011435 */ 011436 int sqlite3HeaderSizeBtree(void){ return ROUND8(sizeof(MemPage)); } 011437 011438 /* 011439 ** If no transaction is active and the database is not a temp-db, clear 011440 ** the in-memory pager cache. 011441 */ 011442 void sqlite3BtreeClearCache(Btree *p){ 011443 BtShared *pBt = p->pBt; 011444 if( pBt->inTransaction==TRANS_NONE ){ 011445 sqlite3PagerClearCache(pBt->pPager); 011446 } 011447 } 011448 011449 #if !defined(SQLITE_OMIT_SHARED_CACHE) 011450 /* 011451 ** Return true if the Btree passed as the only argument is sharable. 011452 */ 011453 int sqlite3BtreeSharable(Btree *p){ 011454 return p->sharable; 011455 } 011456 011457 /* 011458 ** Return the number of connections to the BtShared object accessed by 011459 ** the Btree handle passed as the only argument. 
For private caches 011460 ** this is always 1. For shared caches it may be 1 or greater. 011461 */ 011462 int sqlite3BtreeConnectionCount(Btree *p){ 011463 testcase( p->sharable ); 011464 return p->pBt->nRef; 011465 } 011466 #endif