Contents:
/** main()                                            LINE 50   */
/** mysql_select() explained.                         LINE 2870 */
/** Sergei's article on mysql overview                LINE 490  */
/** Header Files overview                             LINE 630  */
/** mysql_parse() explained.                          LINE 845  */
/** Notes on locking                                  LINE 1030 */
/** mysql_install_plugin() explained.                 LINE 1130 */
/** Notes on Lex and Yacc                             LINE 1260 */
/** NDB pushdown logic explained.                     LINE 1480 */
/** libmysqld.a embedded library explained            LINE 1990 */
/** Optimization, JOIN::prepare(), optimize(), etc closer look. LINE 2010 */
    Also JOIN structures explained.
/** mysql_select() explained with example query. Closer look at subselect() LINE 2900 */
/** MySQL Join syntax:                                LINE 3225 */
/** Postgres Vs MySQL Comparison                      LINE 3300 */
Tracing Range Query :       /** LINE 3410 */
Derived Tables Execution :  /** LINE 3700 */
Understanding Field Class   /** LINE 3810 */
Use_of_List_in_MySQL
mysys_library

Threads:
(1) signal_hand (arg=0x0) at mysqld.cc:2826
(2) main() => handle_connections_sockets() => select at mysqld.cc:5005
(3) handle_one_connection()
    #0 mysql_parse (thd, "create table t(i integer)", ...) at sql_parse.cc:5719
    #1 dispatch_command (command=COM_QUERY, thd=0x8da8928, packet=0x8df5d09 "create table t(i integer)",
       packet_length=25) at sql_parse.cc:1045
    #2 0x0821a519 in do_command (thd) at sql_parse.cc:722
    #3 0x08207abf in handle_one_connection (thd) at sql_connect.cc:1134

main() is defined in mysqld.cc :

Global Variables:
  char *mysql_data_home;
  I_List threads;
  I_List key_caches;
  static I_List thread_cache;
  struct system_variables global_system_variables;
  struct system_variables max_system_variables;
  struct system_status_var global_status_var;
  /* Locks to access global variables ... */
  pthread_mutex_t LOCK_mysql_create_db, LOCK_Acl, LOCK_open, LOCK_error_log, ... ;
  rw_lock_t LOCK_system_variables_hash, ....;   // can be shared.
  static char *mysqld_user;
  static char *default_character_set_name;
  static char *default_collation_name;
  static char *default_storage_engine_str;
  static char compiled_default_collation_name[]= MYSQL_DEFAULT_COLLATION_NAME;

Note: handlerton - one per engine (e.g. callbacks: discover, commit, rollback, table_exists_in_engine, etc.)
      handler    - one per table  (e.g. ha_write_row(buf), ha_update_row(buf), etc.)

Note: sql/mysql_priv.h includes all damn things.

/** main() explained LINE 50 */
main(){
  my_init() { pthread_init(); init homedir with getenv("HOME"); etc }
  Logger::init_base() { init_error_log(); to initialize log file. }
  In Windows, WSAStartup() needs to be called before socket calls.
  init_common_variables() { ... read from config file... default storage engine, etc etc... }
  init_signals();
  initialize thread stack size; 200K!
  (void) thr_setconcurrency(concurrency);   // 10 by default
  init_ssl();   // load DES key file etc
  init_server_components() {
    query_cache_init();
    randominit(&sql_rand,(ulong) server_start_time,(ulong) server_start_time/2);
    setup_fpu();         // setup for 64 bit floating point using _FPU_SETCW() command
    init_slave_list();   // Init slave list hash and mutex
    xid_cache_init(){ Init xid_cache hash; }
    Init some key caches;
    plugin_init(){
      by default federated & ndb cluster is disabled;
      register other built-in plugins;
      // e.g. binlog - Pseudo storage engine to represent binlog
      // partition - Partition storage engine helper (Mike Ronstrom)
      // ARCHIVE, CSV - by Brian Aker
      // blackhole
      // Falcon
      // FALCON_SYSTEM_MEMORY_DETAIL, FALCON_SYSTEM_MEMORY_SUMMARY, ...
      // MEMORY - Hash based, memory tables, useful for temp tables
      // MARIA - Crash safe tables with MyISAM heritage
      // MRG_MYISAM - Collection of identical MyISAM tables
      plugin_initialize(MyISAM and CSV storage engines);
    }
  }
  network_init(){
    thread_scheduler.init() (i.e. scheduler_functions::init());   // some trivial init
    // class scheduler_functions helps to create one-per-connection thread or not
    set_ports(){
      set mysqld_port, mysqld_unix_port to getenv "MYSQL_TCP_PORT" or "MYSQL_UNIX_PORT" respectively.
      Defaults: tcpport=3306, unixport=/tmp/mysql.sock (or MySQL namedpipe in Windows)
    }
    bind; listen;
  }
  my_str_malloc= &my_str_malloc_mysqld;   // this points to my_malloc(); similar logic for free
  // my_malloc() calls malloc() with some standard error printing mechanism
  start_signal_handler(){
    create pid file;
    create pthread signal_hand() which loops with sigwait() to receive signal.
  }
  mysql_rm_tmp_tables(void);
  acl_init(){   // init user/db privileges from mysql database table
    new THD;
    acl_reload(thd){
      Table *table ;   // init this table with name "db", host, user etc. priv tables
      // Sanity check: open and close tables:
      // Look for .frm files first. If not,
      // try to discover the tables from engines. If they exist, create .frm files
      // sql/handler.cc
      open_table()==> {
        int ha_create_table_from_engine(THD* thd, const char *db, const char *name) {
          ha_discover(){
            plugin_foreach(thd, discover_handlerton, MYSQL_STORAGE_ENGINE_PLUGIN, &args);
            // The plugin types: UDF, StorageEngine, parser, daemon, InfSchema(I_S),
            // AUDIT plugins. mysql/plugin.h
            discover_handlerton(){
              hton->discover(hton, thd, vargs->db, vargs->name, vargs->frmblob, vargs->frmlen);
              e.g.
              int archive_discover(handlerton *hton, THD* thd, const char *db, const char *name,
                                   uchar **frmblob, size_t *frmlen){
                discovers table of given name of type archive and initializes frmblob
                and frmlen accordingly.  file: storage/archive/ha_archive.cc
              }
            }
          }
        }
        init_read_record(&read_record_info,thd,table= tables[0].table,NULL,1,0);
        // init so that you can read the record next using calls like:
        //   info->file->rnd_next(info->record);   i.e. random next, not ordered
        based on the read info, initialize &acl_hosts, &acl_users, &acl_dbs !!!
      } // end of open_tables(), open_table()
    } // end of acl_reload()
  } // end of acl_init()
  my_tz_init();   // initialize time zone variables
  grant_init(){   // sql_acl.cc
    use tables, tables_priv and columns_priv tables in "mysql" database;
    Init column_priv_hash accordingly
    grant_reload_procs_priv(THD *thd) {
      Use "procs_priv" table in "mysql" database to initialize procs_priv_hash, func_priv_hash
    }
  }
  servers_init(){
    init &servers_cache hash;
    servers_reload() ==> read "servers" table from "mysql" database and initialize
                         servers_cache hash accordingly.
  }
  udf_init(){
    Read "mysql.func" table (which contains set of udf's)
    Init udf_hash accordingly
  }
  init_status_var() { sorts all_status_var dynamic array for use with show status }
  init_slave(){
    Init global: active_mi = new Master_info;
    init_master_info(active_mi, master_info_file ="master.info", relay_log_info_file, 1,
                     (SLAVE_IO | SLAVE_SQL));
    If needed, call:
      start_slave_threads(1 /* need mutex */, 0 /* no wait for start*/, active_mi,
                          master_info_file, relay_log_info_file, SLAVE_IO | SLAVE_SQL) ;
  }
  if (invoked with --bootstrap option){
    Execute commands from stdin;   Used by mysql_install_db script
    create thread handle_bootstrap(){
      thd->init_for_queries();   // call once before mysql_parse();
      mysql_parse();             // read queries through net interface.
    }
    mysqld_install_cmd_line="mysqld_bootstrap --bootstrap \
        --basedir=$basedir --datadir=$ldata --log-warnings=0 --loose-skip-innodb \
        --loose-skip-ndbcluster --user=root --max_allowed_packet=8M \
        --net_buffer_length=16K"
    Execute these 3 queries : fill_help_tables.sql mysql_system_tables.sql mysql_system_tables_data.sql
  }
  execute_ddl_log_recovery(){
    Read ddl_log.log recovery log in data dir and try to do recovery if needed.
  }
  only in Windows, create_shutdown_thread();   // To respond to Stop Service
  handle_connections_sockets(){
    select ( unix socket and ip socket ) {
      if there is connection: create_new_thread(thd);
    }
  }
} // end of main()

Reading Tables in server :
Typical sequence :
  table= tables.table;
  init_read_record(&read_record_info, new_thd, table, NULL,1,0);
  table->use_all_columns();   // sets bit maps to use all columns for next read record
  while (!(error= read_record_info.read_record(&read_record_info)))
    use the read column;
The read_record function could be either rr_sequential, rr_index, etc.

Note: void init_read_record(READ_RECORD *info, THD *thd, TABLE *table, SQL_SELECT *select,
                            int use_record_cache, bool print_error)
      is defined in libmysqld/records.cc

Question: Diff between update log and bin log ?

Look at Example Storage Engine : The CSV Engine:

ha_tina.h :

typedef struct st_tina_share {
  char *table_name;
  ...
} TINA_SHARE;   // Per table struct

class ha_tina: public handler
{
  THR_LOCK_DATA lock;   /* MySQL lock */
  TINA_SHARE *share;    /* Shared lock info */
  ...
  Transparent_file *file_buff;
  File data_file;                 /* File handler for readers */
  uint local_data_file_version;   /* Saved version of the data file used */
  MEM_ROOT blobroot;

public:
  ha_tina(handlerton *hton, TABLE_SHARE *table_arg);
  ~ha_tina();
  const char *table_type() const { return "CSV"; }
  const char *index_type(uint inx) { return "NONE"; }
  const char **bas_ext() const;
  // Opportunity for SE to tell server what it supports.
  ulonglong table_flags() const
  {
    return (HA_NO_TRANSACTIONS | HA_REC_NOT_IN_SEQ | HA_NO_AUTO_INCREMENT |
            HA_BINLOG_ROW_CAPABLE | HA_BINLOG_STMT_CAPABLE);
  }
  ulong index_flags(uint idx, uint part, bool all_parts) const
  {
    /* We will never have indexes so this will never be called */
    return 0;
  }
  uint max_record_length() const { return HA_MAX_REC_LENGTH; }
  uint max_keys() const { return 0; }
  uint max_key_parts() const { return 0; }
  uint max_key_length() const { return 0; }
  /* Called in test_quick_select to determine if indexes should be used. */
  virtual double scan_time() { return (double) (stats.records+stats.deleted) / 20.0+10; }
  /* The next method will never be called */
  virtual bool fast_key_read() { return 1;}
  /* TODO: return actual upper bound of number of records in the table. (e.g.
     save number of records seen on full table scan and/or use file size as upper bound) */
  ha_rows estimate_rows_upper_bound() { return HA_POS_ERROR; }
  int open(const char *name, int mode, uint open_options);
  int close(void);
  int write_row(uchar * buf);
  int update_row(const uchar * old_data, uchar * new_data);
  int delete_row(const uchar * buf);
  int rnd_init(bool scan=1);
  int rnd_next(uchar *buf);
  int rnd_pos(uchar * buf, uchar *pos);
  bool check_and_repair(THD *thd);
  int check(THD* thd, HA_CHECK_OPT* check_opt);
  bool is_crashed() const;
  int rnd_end();
  int repair(THD* thd, HA_CHECK_OPT* check_opt);
  /* This is required for SQL layer to know that we support autorepair */
  bool auto_repair() const { return 1; }
  void position(const uchar *record);
  int info(uint);
  int extra(enum ha_extra_function operation);
  int delete_all_rows(void);
  int create(const char *name, TABLE *form, HA_CREATE_INFO *create_info);
  bool check_if_incompatible_data(HA_CREATE_INFO *info, uint table_changes);
  int external_lock(THD *thd, int lock_type);
  THR_LOCK_DATA **store_lock(THD *thd, THR_LOCK_DATA **to, enum thr_lock_type lock_type);
  /*
    These functions are used to get/update status of the handler.
    Needed to enable concurrent inserts.
  */
  void get_status();
  void update_status();
  /* The following methods were added just for TINA */
  int encode_quote(uchar *buf);
  int find_current_row(uchar *buf);
  int chain_append();
};

Note:
mysql_declare_plugin(csv)
{
  MYSQL_STORAGE_ENGINE_PLUGIN,
  &csv_storage_engine,
  "CSV",
  "Brian Aker, MySQL AB",
  "CSV storage engine",
  PLUGIN_LICENSE_GPL,
  tina_init_func,   /* Plugin Init */
  tina_done_func,   /* Plugin Deinit */
  0x0100 /* 1.0 */,
  NULL,             /* status variables */
  NULL,             /* system variables */
  NULL              /* config options   */
}
mysql_declare_plugin_end;

This defines a variable:
  struct st_mysql_plugin builtin_csv_plugin[] = { { given values }, {0, 0, ... } } ;

/*
  All table scans call this first. The order of a table scan is:

  ha_tina::store_lock
  ha_tina::external_lock
  ha_tina::info
  ha_tina::rnd_init
  ha_tina::extra          ENUM HA_EXTRA_CACHE      Cache record in HA_rrnd()
  ha_tina::rnd_next
  ha_tina::rnd_next
  ha_tina::rnd_next
  ha_tina::rnd_next
  ha_tina::rnd_next
  ha_tina::rnd_next
  ha_tina::rnd_next
  ha_tina::rnd_next
  ha_tina::rnd_next
  ha_tina::extra          ENUM HA_EXTRA_NO_CACHE   End caching of records (def)
  ha_tina::external_lock
  ha_tina::extra          ENUM HA_EXTRA_RESET      Reset database to after open

  Each call to ::rnd_next() represents a row returned in the scan. When no more
  rows can be returned, rnd_next() returns a value of HA_ERR_END_OF_FILE.
  The ::info() call is just for the optimizer.
*/

Note: include/mysql/mysql_version.h :   #define MYSQL_VERSION_ID 60006

in ha_tina.cc :
  struct st_mysql_storage_engine csv_storage_engine=
  { MYSQL_HANDLERTON_INTERFACE_VERSION };   /* this is version << 8, i.e.
     60006*256 */

If you break the code at tina_create_handler in ha_tina.cc, and issue
create table (i integer) engine=csv, this is what you get in gdb :

(gdb) where
#0  tina_create_handler (hton=0xa2d0df0, table=0xa6e07aac, mem_root=0xa6e07ae4) at ha_tina.cc:437
#1  0x08420aef in get_new_handler (share=0xa6e07aac, alloc=0xa6e07ae4, db_type=0xa2d0df0) at handler.cc:244
#2  0x08359aa5 in open_binary_frm (thd=0xa7ee428, share=0xa6e07aac, head=0xa6e07344 "þ\001\t\021\003", file=72) at table.cc:1200
#3  0x0835b647 in open_table_def (thd=0xa7ee428, share=0xa6e07aac, db_flags=0) at table.cc:634
#4  0x0841cf2b in ha_create_table (thd=0xa7ee428, path=0xa6e081b4 "./mysql/thava6", db=0xa88c5b0 "mysql", table_name=0xa88c388 "thava6", create_info=0xa6e08684, update_create_info=false) at handler.cc:3491
#5  0x083d4ac1 in rea_create_table (thd=0xa7ee428, path=0xa6e081b4 "./mysql/thava6", db=0xa88c5b0 "mysql", table_name=0xa88c388 "thava6", create_info=0xa6e08684, create_fields=@0xa6e08b90, keys=0, key_info=0xa88cb08, file=0xa844ca8) at unireg.cc:482
#6  0x0843c235 in mysql_create_table_no_lock (thd=0xa7ee428, db=0xa88c5b0 "mysql", table_name=0xa88c388 "thava6", create_info=0xa6e08684, alter_info=0xa6e08b6c, internal_tmp_table=false, select_field_count=0) at sql_table.cc:3641
#7  0x0843ca36 in mysql_create_table (thd=0xa7ee428, db=0xa88c5b0 "mysql", table_name=0xa88c388 "thava6", create_info=0xa6e08684, alter_info=0xa6e08b6c, internal_tmp_table=false, select_field_count=0) at sql_table.cc:3790
#8  0x08304f46 in mysql_execute_command (thd=0xa7ee428) at sql_parse.cc:2452
#9  0x0830d03d in mysql_parse (thd=0xa7ee428, inBuf=0xa88c2e8 "create table thava6(i integer) engine=csv", length=41, found_semicolon=0xa6e09280) at sql_parse.cc:5811
#10 0x0830dabb in dispatch_command (command=COM_QUERY, thd=0xa7ee428, packet=0xa83cc49 "create table thava6(i integer) engine=csv", packet_length=41) at sql_parse.cc:1051
#11 0x0830ed8d in do_command (thd=0xa7ee428) at sql_parse.cc:724
#12 0x082fb587 in handle_one_connection (arg=0xa7ee428) at sql_connect.cc:1153
#13 0x0081443b in start_thread () from /lib/libpthread.so.0
#14 0x0076bfde in clone () from /lib/libc.so.6

When you run mysql -u root mysql, this is the sequence that happens :
client executes : "show databases"
  mysql_parse ()
  => mysql_execute_command ()
  => execute_sqlcom_select(thd, all_tables);
  => handle_select()   (sql_select.cc)
  => mysql_select() {
       new JOIN();
       join->prepare();   /* even a single select is handled as a special case of join. */
       join->optimize();
       join->exec();

Create table logic:
(gdb) where
  mysql_create_frm (thd=0xb832120, file_name=0xa781af48 "./mysql/mytable.frm", ...
#0 rea_create_table (thd, "./mysql/mytable", db= "mysql", "mytable", create_info,
   create_fields, keys=1, key_info, file=0xb858608) at unireg.cc:467
#1 mysql_create_table_no_lock () sql_table.cc:3641
#2 mysql_create_table ()
#3 mysql_execute_command ()
#4 mysql_parse ()

Conditional Pushdown:
sql_select.cc: make_join_select() :
  if (thd->variables.engine_condition_pushdown)
  {
    COND *push_cond= make_cond_for_table(tmp, current_map, current_map, 0);
    if (push_cond)
      /* Push condition to handler */
      if (!tab->table->file->cond_push(push_cond))
        tab->table->file->pushed_cond= push_cond;

    Note: tab=JOIN_TAB;  table=struct st_table;  handler *file;
          virtual const COND *cond_push(const COND *cond) { return cond; }
          Return 0 means the condition will be honored by the table.
  }
  }

typedef struct st_join_table {
  TABLE *table;
  SQL_SELECT *select;
  COND *select_cond;
  QUICK_SELECT_I *quick;
  ..
} JOIN_TAB;
JOIN_TAB tab;

ha_tina.cc :
  struct st_mysql_storage_engine csv_storage_engine=
  { MYSQL_HANDLERTON_INTERFACE_VERSION };
  /* handlertons of different MySQL releases are incompatible */
  #define MYSQL_HANDLERTON_INTERFACE_VERSION (MYSQL_VERSION_ID << 8)
  #define MYSQL_VERSION_ID 60006   /* Expressed in decimal for 6.0.06 */

include/my_base.h contains useful flags info for ha_open(), ha_extra(), etc flags
sql/table.cc create_frm() creates .frm file:

What is ha_myisam::extra( enum ha_extra_function operation ) ?
Ans: It is a command function from optimizer to handler. e.g. HA_EXTRA_FORCE_REOPEN
     will cause myisam to reopen the underlying file. The return value is error 1/0.

What is the diff between ha_myisam::extra() and extra_opt() ?
Ans: Both are handled by the mi_extra() function. extra_opt() passes a flag like
     HA_WRITE_CACHE with a cache size, indicating that it will be good for the handler
     to init a write cache for performance. It is up to the storage engine to really
     bother about this optimization or ignore it.

Notes from Sergei's article :
/** Sergei's article on mysql overview LINE 490 */
======================================================================
Communication Protocol :
  vio - Virtual IO - for ssl/tcp/socket layer
  libmysqld/net_serv.cc -- On-the-wire high-level protocol
======================================================================
Parsing:
  LALR(1) parser built by bison
  Hand written lexer - flex is not used
  The query is parsed into memory data structures - table lists, select trees,
  expression trees. No bytecode.

  class Item
    Base class for expressions
    fields, constants, functions, variables, placeholders, subqueries -
    everything is an Item
    properties: name, type, maybe_null, null_value, charset, collation,
                used_tables, const_item, args[ ], ...
    methods: val_int(), val_str(), val_real(), val_decimal(); reset(), add();
             eq(), print(), fix_fields(), ...
  Evaluating an expression of any complexity is as simple as
    expr->val_int()   // or val_str(), val_real(), val_decimal()
======================================================================
Preparing For Execution:

Checking privileges
-------------------
1878   res= check_table_access(thd,
                               lex->exchange ? SELECT_ACL | FILE_ACL : SELECT_ACL,
                               all_tables, 0);

Opening and locking tables
  deadlock-free locking: all tables are locked at once and always in the same order

enum enum_thr_lock_result thr_multi_lock(THR_LOCK_DATA **data, uint count,
                                         THR_LOCK_OWNER *owner)
{
  sort_locks(data,count);
  for (pos=data,end=data+count; pos < end ; pos++)
  {
    enum enum_thr_lock_result result= thr_lock(*pos, owner, (*pos)->type);
    if (result != THR_LOCK_SUCCESS)
    { /* Aborted */
      thr_multi_unlock(data,(uint) (pos-data));
      DBUG_RETURN(result);
    }
  }
  DBUG_RETURN(THR_LOCK_SUCCESS);
}

Question: Does this mean *all tables* are locked before storage engines get
requests to retrieve rows ?  innodb implements row level locks.

No complex locking protocols used by server!
* this is true even in complex statements with routines, subqueries, views, triggers *

Calling item->fix_fields()
  fields find their tables
  items define basic properties like maybe_null, const_item, used_tables
======================================================================
Optimizing: sql/sql_select.cc   int JOIN::optimize()

Equality propagation: WHERE f1 = f2 AND f2 = f3  =>  f1 = f2 = f3

optimize_cond(JOIN *join, COND *conds, List *join_list, Item::cond_result *cond_value)
{
  Equality propagation: WHERE f1 = f2 AND f2 = f3  =>
    f1 = f2 = f3
  /* change field1 = field2 to field2 = const for each found field1 = const */
  /* Remove all instances of item == item */
}

NOT elimination: WHERE NOT f1 > 5  =>  f1 <= 5
  Item *negate_expression(THD *thd, Item *expr) { .... }
======================================================================
Choosing an execution plan
  Getting statistics from the storage engine
    index cardinality, table scan and index lookup costs
  Creating a list of possible indexes
  Performing search for the best execution plan
    best plan is the one that has the lowest cost
    exhaustive search (up to MySQL 4.1), or greedy search (starting from 5.0)
======================================================================
Sending data
  result rows are sent as soon as they are found in the nested-join loop, row by row
  class select_result:
    select_send
    select_export               SELECT ... INTO OUTFILE
    select_insert               INSERT ... SELECT
    select_exists_subselect
    multi_delete
    ...
  also it could do aggregation for GROUP BY or write rows to a temporary table,
  instead of sending them out

Temporary tables
  if the optimizer decides it needs a temporary table, the nested-join loop populates
  this table instead of sending the data to the client;
  then a second run is performed, fetching the data from the temporary table.
  a temporary table can be used for aggregation (GROUP BY) or uniqueness
  (DISTINCT, UNION) - it has a unique index created as necessary.

You can use the TEMPORARY keyword when creating a table. A TEMPORARY table is
visible only to the current connection, and is dropped automatically when the
connection is closed. This means that two different connections can use the same
temporary table name without conflicting with each other or with an existing
non-TEMPORARY table of the same name. (The existing table is hidden).
Note: CREATE TABLE does not automatically commit the current active transaction
if you use the TEMPORARY keyword.
=========================================================================
item.h :
  struct Hybrid_type {
    longlong integer;
    double real;
    ...
  }
============================================================================
What is the JOIN structure? What is TABLE_LIST ? What is COND ?
How are join conditions represented in Item ?

/** Header Files overview LINE 630 */
Primitive types/header files:
  sql_bitmap.h, sql_array.h   (class Dynamic_array)
  plugin.h       (forward declare: class THD, Item; Defines struct st_mysql_xid;
                  XID type as per X/Open std.)
  scheduler.h    (scheduler functions which attach threads per connection)
  mysql_com.h    [ common between client/server. Usable within C and C++
                   enum enum_server_command { ...,COM_STMT_PREPARE, ... }
                   struct st_vio; (forward declare);   struct st_net { ... } NET;
                   enum_field_types { MYSQL_TYPE_DECIMAL,...}
                   #define FIELD_TYPE_DECIMAL MYSQL_TYPE_DECIMAL
                   enum enum_cursor_type { no cursor, read-only, update, scrollable}
                   UDF declarations
  violite.h      virtual IO type. e.g. tcpip, namedpipe, ssl, sharedmem, etc
  unireg.h       Historical use. Designed initially for specific form terminal.
                 MAX constants for fields(4096), time width, etc. openfrm() function
                 parameters. includes structs.h: which defines KEY, KEY_PART_INFO,
                 REGINFO, etc.
  mysqld_error.h Defines all MySQL Error constant integers
  sql_string.h   class String;  copy_and_convert(); etc.
  sql_list.h     Sql_alloc : Std class mem allocator.
                 struct list_node:public Sql_alloc => single linked list
                 List Iterator
                 struct ilink; => Intrusive Linked list.
  sql_map.h      Interface for mem mapped files.
  my_decimal.h   includes decimal.h; All declarations for decimal support
  handler.h      includes my_handler.h, ft_global.h, keycache.h
                 struct xid_t;
                 Index capabilities, different flags;
                   e.g. Requires primary key (can't create underneath)
                 tablespace related definitions;

    struct handlerton {
      uint slot;      // thd->ha_data[xxx_hton.slot] provides per connection
                      // memory area.
      int (*commit)(...);  etc.
      uint32 flags;   // global handler flags.
      void *data;     // engine specific data
    };

    class Ha_trx_info {
      // transaction info
      // thd->transaction.{stmt|all}.ha_list contains
      // an instance of this per engine
      // SE calls trans_register_ha() to put itself
      // in the thread's thd->transaction's ha_list.
    }

    HA_CREATE_INFO structure;  class COST_VECT;  class ha_statistics;

    class handler {
      struct st_table *table;               // table cursor
      struct st_table_share *table_share;   // table definition
      ulonglong Table_flags;
      handlerton *ht;                       // pointer to storage engine
      ha_statistics stats;
      const Item *pushed_cond;
      Item *pushed_idx_cond;
      uint pushed_idx_cond_keyno;
      ulonglong next_insert_id;             // next val to be inserted
      int ha_open(...);
      int ha_external_lock(THD *thd, int lock_type);
      int ha_write_row(uchar *buf);
      ....
      virtual COND* cond_push(COND *cond) { return cond; }
      ...
      ha_rows records() { return stats.records; }
      virtual int rnd_next(uchar *buf);
      virtual int info(uint);   // see my_base.h
                                // info(HA_STATUS_CONST); updates
                                // handler::table->s->keys_in_use, etc
      void set_next_insert_id(ulonglong id) { next_insert_id=id; }
      virtual int lock_table(THD *thd, int lock_type, int timeout);
      void **ha_data(THD*);
      THD* ha_thd(void);
    }

table.h :
  struct st_table_share {
    ... table definition ...
    /* The following is copied to each TABLE on OPEN */
    Field **field;
    Field **found_next_number_field;
    Field *timestamp_field;      /* Used only during open */
    KEY  *key_info;              /* data of keys in database */
    void *ha_data;               // storage engine data
  }

  struct st_table {
    ...
    TABLE_SHARE *s;
    handler *file;
    THD *in_use;
    Field **field;
    KEY  *key_info;       // data of keys in database
    int  current_lock;    // type of lock on table
    ....
  }

  struct st_field_info { ... }
  typedef class Item COND;
  class Natural_join_column;

  struct TABLE_LIST {   // Table reference in FROM clause.
    st_select_lex select_lex;
    Item *where;
    TABLE_LIST *next_leaf;
    LEX_STRING select_stmt;
    TABLE *table;
    uint table_id;       // table id from binlog
    char *alias;
    Item *on_expr;
    List<String> *join_using_fields;
  }

structs.h:
  typedef struct st_key {
    uint  key_length;          /* Tot length of key */
    ulong flags;               /* dupp key and pack flags */
    uint  key_parts;           /* How many key_parts */
    uint  extra_length;
    uint  usable_key_parts;    /* Should normally be = key_parts */
    uint  block_size;
    enum  ha_key_alg algorithm;
    /*
      Note that parser is used when the table is opened for use, and
      parser_name is used when the table is being created.
    */
    union
    {
      plugin_ref parser;         /* Fulltext [pre]parser */
      LEX_STRING *parser_name;   /* Fulltext [pre]parser name */
    };
    KEY_PART_INFO *key_part;
    char *name;                  /* Name of key */
    /*
      Array of AVG(#records with the same field value) for 1st ... Nth key part.
      0 means 'not known'.
      For temporary heap tables this member is NULL.
    */
    ulong *rec_per_key;
    union {
      int bdb_return_if_eq;
    } handler;
    struct st_table *table;
    LEX_STRING comment;
  } KEY;

  typedef struct st_key_part_info {   /* Info about a key part */
    Field *field;
    uint offset;        /* offset in record (from 0) */
    uint null_offset;   /* Offset to null_bit in record */
    /* Length of key part in bytes, excluding NULL flag and length bytes */
    uint16 length;
    /*
      Number of bytes required to store the keypart value. This may be different
      from the "length" field as it also counts
      - possible NULL-flag byte (see HA_KEY_NULL_LENGTH) [if null_bit != 0, the
        first byte stored at offset is 1 if null, 0 if non-null; the actual value
        is stored from offset+1].
      - possible HA_KEY_BLOB_LENGTH bytes needed to store actual value length.
    */
    uint16 store_length;
    uint16 key_type;
    uint16 fieldnr;          /* Fieldnum in UNIREG (1,2,3,...) */
    uint16 key_part_flag;    /* 0 or HA_REVERSE_SORT */
    uint8 type;
    uint8 null_bit;          /* Position to null_bit */
  } KEY_PART_INFO ;

Note: difference between TABLE and TABLE_LIST ?
      The second one contains the "where" condition, select_lex, etc.
      Contains TABLE * too. Contains all info related to the FROM clause.

field.h :      class Field, Field_num, Field_decimal, etc.
item.h :       class Item, Item_num, Item_case_expr, Item_sum, Item_outer_ref,
sql_class.h :  Alter_column, Alter_drop, Key, Foreign_key, MYSQL_LOCK,
               struct Query_cache_tls /* thread local data */,
               struct system_variables, class Statement,
               class THD : public Statement,
sql_cache.h :  SQL query cache related
procedure.h :  /* when using sql procedures */
sql_select.h : Classes to use while using where clause: class JOIN;
========================= End of Header Files Overview =================================================

Insert statement flow :
(gdb) where
#0  mysql_parse (thd=0xa35bff8, inBuf=0xa3b2418 "insert into t values(1)", length=23, found_semicolon=0xa6efe280) at sql_parse.cc:5763
#1  0x0830dadb in dispatch_command (command=COM_QUERY, thd=0xa35bff8, packet=0xa3aa3b9 "insert into t values(1)", packet_length=23) at sql_parse.cc:1051
#2  0x0830edad in do_command (thd=0xa35bff8) at sql_parse.cc:724
#3  0x082fb5a7 in handle_one_connection (arg=0xa35bff8) at sql_connect.cc:1153
#4  0x0081443b in start_thread () from /lib/libpthread.so.0
#5  0x0076bfde in clone () from /lib/libc.so.6

/** mysql_parse() explained. LINE 845 */
mysql_parse(){
  lex_start(thd){
    initialize thd->lex of type LEX (struct st_lex)
    what does LEX contain ? Everything about the statement.
    struct st_lex : public Query_tables_list {
      SELECT_LEX select_lex;
      SELECT_LEX current_select;
      SELECT_LEX all_selects_list;
      SELECT_LEX_UNIT unit;
      Item *default_value, *on_update_value;
      XID *xid;
      THD *thd;
      // From Query_tables_list
      TABLE_LIST *query_tables;
      SQL_LIST sroutines_list;   // To track tables used by stored funcs/triggers
    }
    What is SELECT_LEX ? info about the where condition, from tables, result table,
    temp table, pointers to subselects, groupby, orderby
    class st_select_lex : public st_select_lex_node {
      Item *where, *having;
      SQL_LIST table_list;
      SQL_LIST group_list;            // GROUP BY clause
      SQL_LIST order_list;            // ORDER BY clause
      List<Item> item_list;           // List of fields and expressions
      List<TABLE_LIST> top_join_list; // join list of top level
      JOIN *join;                     // after JOIN::prepare, use this only
      ....
    }
    What is SQL_LIST ? non-template, simple linked list. Why not List ?
  }
  mysql_reset_thd_for_next_command(thd){
    clear thd->server_status;
    thd->sent_row_count= thd->examined_row_count= 0;
  }
  query_cache.send_result_to_client(thd, (char*) inBuf, length){
    if query found in cache, net_real_write(&thd->net, result->data(), ...);  return
  }
  Lex_input_stream lip(thd, inBuf, length);   // basic lex input setup
  parse_sql(thd, &lip, NULL){
    invokes yyparse i.e. MYSQLparse() from sql_yacc.yy
    sets up lex structure.
  }
  check_mqh() { enforce per-user limit of queries in last 1 hour }
  lex->safe_to_cache_query= 0;
  thd->server_status|= SERVER_MORE_RESULTS_EXISTS;
  lex->set_trg_event_type_for_tables();
  mysql_execute_command(thd) {
    check_table_access(thd, privileges_requested, all_tables, ...);
    execute_sqlcom_select(thd, all_tables) {
      open_and_lock_tables(thd, all_tables, derived_also=TRUE, flags=0){
        open_table(thd, TABLE_LIST tables, ...){
          // The TABLE_LIST elements indicate things like "create" table, etc.
          // If creating a new table, just a place holder is returned with an
          // exclusive name lock.
          // How is the table locked for read-only or read-write ?
          //
          // Also the open table cache is reused if possible.
          //
          // Interesting Variables:
          //   table->query_id, thd->query_id = kind of statement id ??
          //   table_list->lock_type = TL_WRITE_ALLOW_WRITE etc.
          //   table->reginfo.lock_type
          //
          //   thd->handler_tables = All currently open tables (TABLE*)
          //   thd->open_tables = what is the diff? (TABLE*)
          //   thd->handler_tables_hash = HASH of TABLE_LIST; contains the
          //                              closed tables cache ???
          //                              not affected by flush tables command.
          //   HASH open_cache;               // sql_base.cc server wide open tab cache
          //   static HASH table_def_cache;   // sql_base.cc server wide cache
          //   thd->locked_tables (type MYSQL_LOCK*)
          //
          // what is the use of the LOCK_open global lock; - Used for checking if
          // the table is in the open cache etc. complex logic. Explain.
        }
        // Note: table should be open to be locked.
        lock_tables(thd, tables, counter, &need_reopen);
        close_tables_for_reopen(thd, &tables);
      }
      result= new select_send();   // just constructs the data structure
      query_cache_store_query(thd, all_tables);
      handle_select(thd, lex, result, 0){
        // The first select_lex in lex has special meaning for non-select also.
        SELECT_LEX *select_lex = &lex->select_lex;
        mysql_select(...){
          join= new JOIN(thd, fields, select_options, result);
          join->prepare(...); {
            setup_tables_and_check_access();   // basic setup
          }
          join->flatten_subqueries();
          join->optimize();
          join->exec() {
            do_select() => sub_select(){
              join_init_read_record(JOIN_TAB *tab){
                records.cc:init_read_record() {
                  info->read_record = rr_sequential;
                  table->file->ha_rnd_init(1);
                  // advise SE to cache the scan results
                  table->file->extra_opt(HA_EXTRA_CACHE, thd->variables.read_buff_size);
                  table->file->cond_push(select->cond);
                }
                call rr_sequential()
              }
              evaluate_join_record(join, join_tab, error){
                join->result->send_data(*join->fields);
              }
            }
          }
        }
      }
    }
  }
} // end of mysql_parse()
====================================================================================
Generating Cross Reference: Generating Browsable Binary Log Information

Source files in the sql directory of a MySQL source tree or source distribution
contain comments that can be processed with doxygen to generate HTML files that
describe classes, files, and so forth. Those files can be viewed in your Web
browser. To generate the HTML files and view information about the classes
related to binary logging, do this:

1. In your MySQL source tree, change location to the sql directory:
   shell> cd sql
2. Run doxygen to generate the HTML files.
   These commands create a subdirectory named html containing the HTML output:
   shell> doxygen -g
   shell> doxygen
3. To view the top-level index page, load the html/index.html file into your browser.
4. To view the classes for binary logging, load the html/class_log__event.html page.
   The Log_event class is the main event class, and the others inherit from Log_event.

The pages also contain links that take you to other related pages. For example, to
navigate from index.html to class_log__event.html, click on the Classes tab. On the
next page, scroll down to Log_event and click on it.
====================================================================================
FAQ:
Global Caches:
  - open_cache === Global list of open tables TABLE *. see sql_base.cc
  sql_base.cc : local cache: table_def_cache (contains TABLE_SHARE *)

open_table() Closer look :

TABLE *open_table(THD *thd, TABLE_LIST *table_list, MEM_ROOT *mem_root,
                  bool *refresh, uint flags)
{
  // Initially HASH open_cache contains a list of TABLE* including mysql:db,
  // :host, :columns_priv etc tables. (where are these opened ?)
  // During flush tables; all these get flushed.
}
===================================================================================
/** Notes on locking LINE 1030 */
Locking : (from lock.cc)

  Locking functions for mysql.

  Because of the new concurrent inserts, we must first get external locks before
  getting internal locks.  If we do it in the other order, the status information
  is not up to date when called from the lock handler.

GENERAL DESCRIPTION OF LOCKING

When not using LOCK TABLES:

- For each SQL statement mysql_lock_tables() is called for all involved tables.
  - mysql_lock_tables() will call table_handler->external_lock(thd,locktype)
    for each table. This is followed by a call to thr_multi_lock() for all tables.

- When the statement is done, we call mysql_unlock_tables().
  This will call thr_multi_unlock() followed by
  table_handler->external_lock(thd, F_UNLCK) for each table.

- Note that mysql_unlock_tables() may be called several times as
  MySQL in some cases can free some tables earlier than others.

- The above is true both for normal and temporary tables.

- Temporary non transactional tables are never passed to thr_multi_lock()
  and we never call external_lock(thd, F_UNLCK) on these.

When using LOCK TABLES:

- LOCK TABLE will call mysql_lock_tables() for all tables.
  mysql_lock_tables() will call
  table_handler->external_lock(thd,locktype) for each table.
  This is followed by a call to thr_multi_lock() for all tables.

- For each statement, we will call table_handler->start_stmt(THD)
  to inform the table handler that we are using the table.

  >>> The tables used can only be tables used in LOCK TABLES or a
  >>> temporary table.

- When the statement is done, we will call ha_commit_stmt(thd);

- When calling UNLOCK TABLES we call mysql_unlock_tables() for all
  tables used in LOCK TABLES

If table_handler->external_lock(thd, locktype) fails, we call
table_handler->external_lock(thd, F_UNLCK) for each table that was locked,
excluding the one that caused the failure. That means the handler must clean up
itself in case external_lock() fails.
==============================================================================
Case Study:

Beginning of a program:
  Basic global tables are open in open_cache
  No thread specific tables are open.

mysql> use test;

This triggers open table of all tables in database "test" including table "t1"
that I had created earlier : (locking is a different step).
New table entry is allocated as follows :
  table=(TABLE*) my_malloc(sizeof(*table),MYF(MY_WME))
  /*
   * table_list contains "test:t1" etc.  table is yet not populated!!!
   */
  error= open_unireg_entry(thd, table, table_list, alias, key, key_length,
                           mem_root, (flags & OPEN_VIEW_NO_PARSE))
  {
    TABLE_SHARE *share= get_table_share_with_create(thd, table_list, cache_key,...);
    open_table_from_share(thd, share, alias,..){ }
    if (there is a problem in getting the table share) {
      // discover table from engine: if needed.
      ha_create_table_from_engine(THD* thd, const char *db, const char *name);
      // if share->ref_count != 1 it returns error! why?
      // It means as long as somebody keeps the table open (no matter what lock
      // it is), we can not open it ???? Does not make much sense ????
    }
  }

MySQL Example Queries to simulate parallel queries :
  select * from t1 where sleep(30) >= 0 ;

/** mysql_install_plugin() explained. LINE 1130 */
Tracing of install/uninstall plugins:

mysql_install_plugin(){
  TABLE_LIST tables => tables.db="mysql"; tables.table_name="plugin";
  // you need insert permission in mysql:plugin table
  check_table_access(thd, INSERT_ACL, &tables, ...);
  // Open mysql:plugin table for write ...
  open_ltable(thd, &tables, TL_WRITE, 0);
  Lock &LOCK_plugin mutex ;
  plugin_add(thd->mem_root, name, ...){
    // If dll is already loaded only ref_count is incremented.
    dlopen innodb.so
    plugin->ref_count is set to 1.
    plugin_dl.version = Look up symbol int value of "_mysql_plugin_interface_version"
                        in the .so library.
    Note: #define MYSQL_PLUGIN_INTERFACE_VERSION 0x0100
    "_mysql_plugin_declarations" array contains the plugin struct array
  }
  plugin_initialize(plugin) {
    plugin->state = ready ;
    plugin type initialize, i.e.
    ha_initialize_handlerton(plugin){
      handlerton *hton = my_malloc(...);
      plugin->plugin->init(hton);   // e.g. innobase_init(hton);
      plugin->data = hton;
      installed_htons[hton->db_type] = hton;
    }
    add status variables for this plugin;
  }
  // We don't replicate install plugin stmt!!!
  // Temporarily disable bin logging for the insert into the mysql.plugin table.
  tmp_disable_binlog(thd);
  // To directly write(or insert) into the table, you could do:
  table->field[0]->store(name->str, name->length, system_charset_info);
  table->field[1]->store(dl->str, dl->length, files_charset_info);
  error= table->file->ha_write_row(table->record[0]);
  reenable_binlog(thd);
  UnLock &LOCK_plugin mutex ;
  If there was an error {
    set state as plugin deleted;
    reap_needed = true;
    reap_plugins();
  }
  my_ok(thd);   // short cut for: thd->main_da.set_ok_status(...);
  set_thd_proc_info("query end");   // prints a debug msg
                                    // proc_info: calling func : info etc.
}

mysql_uninstall_plugin(thd, lex_string){
  /**
    Note: Internal Plugin struct contains mysql Plugin struct:
      struct st_plugin_int { name; struct st_mysql_plugin plugin; ... }
    The status_vars exist only in the mysql plugin;
    The system_vars exist in both internal and mysql plugin structs.
  */
  Look in internal plugins:
    struct st_plugin_int *plugin = plugin_find_internal(name, MYSQL_ANY_PLUGIN);
  built-in plugin (i.e. plugin->plugin_dl == NULL ) can't be unloaded.
  plugin->state = deleted;
  if (plugin->ref_count > 0)
    issue warning : "plugin is busy and
will be uninstalled on shutdown" && return; reap_needed = true; reap_plugins(){ Acquire LOCK_plugin; Mark deleted plugins as "dying" ; Unlock LOCK_plugin; plugin_deinitialize(plugin, true){ remove plugin's status variables ; Note: Array plugin_type_deinitialize[] contains array of plugin unload functions: For type1, it is: Call ha_finalize_handlerton(plugin) { // handlerton *hton= (handlerton *)plugin->data; installed_htons[hton->db_type]= NULL; hton->panic(hton, HA_PANIC_CLOSE){ // e.g. innobase_end() innobase_shutdown_for_mysql(){ logs_empty_and_mark_files_at_shutdown(){ wait while trx_n_mysql_transactions > 0 ; make check points, etc. } shutdown all threads; } } } plugin->state = uninitialized; } plugin_del(plugin){ Free plugin->system_vars; unload shared libs; } } delete plugin from mysql.plugin table; } Conditional Push : (gdb) where #0 ha_ndbcluster_cond::cond_push (this=0xac3f100, cond=0xac6cc98, table=0xac236f0, ndb_table=0xac5cb18) at ha_ndbcluster_cond.cc:923 #1 0x0848acb4 in ha_ndbcluster::cond_push (this=0xac41b68, cond=0xac6cc98) at ha_ndbcluster.cc:9458 #2 0x08313a70 in make_join_select (join=0xac6ce00, select=0xac6e6d8, cond=0xac6cc98) at sql_select.cc:6120 #3 0x08320b85 in JOIN::optimize (this=0xac6ce00) at sql_select.cc:1036 #4 0x08323ba2 in mysql_select (thd=0xac37900, rref_pointer_array=0xac38d88, tables=0xac6c9b0, wild_num=1, fields=@0xac38d24, conds=0xac6cc98, og_num=0, order=0x0, group=0x0, having=0x0, proc_param=0x0, select_options=2147764736, result=0xac6cde8, unit=0xac38a18, select_lex=0xac38c90) at sql_select.cc:2346 #5 0x08329105 in handle_select (thd=0xac37900, lex=0xac389bc, result=0xac6cde8, expain select * from t1; ====================================================== /** Notes on Lex and Yacc LINE 1260 */ Lex and Yacc: Input: target temperature 22 Output: New temperature set! The tokens we need to recognize are: target, temperature, NUMBER. The Lex tokenizer is: %{ #include #include "y.tab.h" %} /* yylval is of type YYSTYPE - you can declare it as int or char * or whatever */ %% [0-9]+ yylval=atoi(yytext);return NUMBER; target return TOKTARGET; temperature return TOKTEMPERATURE; \n /* ignore end of line */; [ \t]+ /* ignore whitespace */; %% Yacc Grammar : %{ #include #include void yyerror(const char *str) { fprintf(stderr,"error: %s\n",str); } int yywrap() { return 1; } main() { yyparse(); } %} %token NUMBER TOKHEAT STATE TOKTARGET TOKTEMPERATURE #define YYSTYPE int /* or char * if it is string */ commands : target_set target_set: TOKTARGET TOKTEMPERATURE NUMBER { printf("\tTemperature set as %d\n", $3); $$ = $3; /* you can later use this */ } ; Note: In yacc, you can also declare the return type of token to be union: %token TOKHEATER TOKHEAT TOKTARGET TOKTEMPERATURE %union { int number; char *string; } %token STATE %token NUMBER %token WORD In lex, you will refer them as: yylval.number = ... ; yylval.string = ...; etc. In yacc, you directly refer $2 etc which is replaced by the right type. MySQL sql_yacc.yy : %{ /* thd is passed as an argument to yyparse(), and subsequently to yylex(). ** The type will be void*, so it must be cast to (THD*) when used. ** Use the YYTHD macro for this. */ #define YYPARSE_PARAM yythd #define YYLEX_PARAM yythd #define YYTHD ((THD *)yythd) #define YYLIP (& YYTHD->m_parser_state->m_lip) #define Lex (YYTHD->lex) #define Select Lex->current_select #include "mysql_priv.h" ... int yylex(void *yylval, void *yythd); .... 
/* helper functions */
%}

%union {
  int  num;
  ulong ulong_num;
  ulonglong ulonglong_number;
  longlong longlong_number;
  LEX_STRING lex_str;
  LEX_STRING *lex_str_ptr;
  LEX_SYMBOL symbol;
  Table_ident *table;
  char *simple_string;
  Item *item;
  Item_num *item_num;
  List<Item> *item_list;
  List<String> *string_list;
  String *string;
  Key_part_spec *key_part;
  TABLE_LIST *table_list;
  udf_func *udf;
  LEX_USER *lex_user;
  struct sys_var_with_base variable;
  st_select_lex *select_lex;
  ...
}

/* The following specifies which ids are of which type */
%type <lex_str> IDENT IDENT_QUOTED TEXT_STRING ....
%type <table>   table_ident table_ident_nodb references xid ....

%%

Following are the describe (explain) actions, for:  explain select * from t;

describe :
  describe_command { Lex->describe |= DESCRIBE_NORMAL; }
  select           { Lex->select_lex.options |= SELECT_DESCRIBE; }

During parsing, the structures that have been set up:
  thd->lex->select_lex       // Type SELECT_LEX; first select
  thd->lex->current_select   // Type SELECT_LEX; current parsing

  Item *item= new (thd->mem_root) Item_field(&thd->lex->current_select->context,
                                             NULL, NULL, "*");
  thd->lex->current_select->add_item_to_list(thd, item);
    // i.e. item_list.push_back(item)
  (thd->lex->current_select->with_wild)++;

// For each table in the from list :
  #define Select thd->lex->current_select
  // Select is current_select, of st_select_lex type.
  Select->add_table_to_list(YYTHD, Table_ident, alias,
                            Select->get_table_join_options(),   // what ???
                            Lex->lock_option,
                            Select->pop_index_hints())
  {
    // sql_parse.cc
    Allocate TABLE_LIST *;
    // Add an entry in SELECT_LEX :: table_list for this table.
    // Note: SQL_LIST table_list; is a bit clumsy. used for linked list.
    Initialize Select->context with from tables for name resolution
  }

// Process where condition : e.g. i>5;
  new Item_field(Lex->current_context(), NullS, NullS, $1.str);
  new Item_int($1.str, ...);   // Integer constant. e.g. 5
  Select->where = Item expression;
  where_expr_item->top_level_item();   // useful for optimizing.

lex->sql_command = SQLCOM_SELECT;
// Mark the first select lex as "describe". The command remains as "Select command" only.
lex->select_lex.options |= SELECT_DESCRIBE;

mysql_execute_command:
  execute_sqlcom_select:
    open_and_lock_tables(thd, all_tables);
    thd->send_explain_fields(select_result *result=new select_send());
    mysql_explain_union(thd,...) :
      mysql_select():
        join= new JOIN(thd, fields, select_options, result);
        join->prepare(rref_pointer_array, tables, wild_num, conds, og_num,
                      order, group, having, proc_param, select_lex, unit);
          setup_wild(...);   // process * in the query
        join->optimize();

Important Definitions Locations:
  sql_lex.h:  typedef class st_select_lex SELECT_LEX;   // DECLARE
              class st_select_lex: public st_select_lex_node { ... }

/** NDB pushdown logic explained. LINE 1480 */
// handler function
ha_ndbcluster::cond_push(const COND *cond)
{
  m_cond = new ha_ndbcluster_cond;
  m_cond->cond_push(cond, table, (NDBTAB*)m_table){
    Initialize a context which holds input/output info.
    item->traverse_cond(&ndb_serialize_cond, context, Item::PREFIX);
  }
}
=========================================================================
context holds the following first :
Top Level :
  context.expect(Item::FUNC_ITEM);   // > < = etc operators (yields int result even for boolean)
  context.expect(Item::COND_ITEM);   // AND, OR etc. logical operators
                                     // Item_cond uses a list of operands.
Inheritance: Item => Item_result_field => Item_func => Item_int_func => Item_bool_func => Item_cond
// cond item is a func item (cond item is specialized from func item)

// Item_func class used for AND, OR, <, >, etc operators: in item_func.h
class Item_func : public Item_result_field {
  ....
  enum Functype { EQ_FUNC, IN_FUNC, COND_OR_FUNC, etc etc }
  enum Type type() const { return FUNC_ITEM; }
  ....
}

class Item_result_field : public Item   /* Item with result field */
{
  Field *result_field;
  void set_result_field(Field *field) { result_field= field; }
  ....
}

// For logical AND, OR etc operators, Item_cond is used.
class Item_cond: public Item_bool_func  (: public Item_int_func : public Item_func )
{
  ...
  enum Type type() const { return COND_ITEM; }
  List<Item>* argument_list() { return &list; }
  ....
}

Note: IN and BETWEEN functions are rewritten using AND/OR conditions.
      IN types allowed: int, string, varbin only.

For the condition   select * from t1 where a > 5 and b > 3 :

Sequence: Top level tree item:

Step 1: Expect cond_item or func_item   [AND - cond item]
        Push this onto the NDB condition stack

Step 2: Process ">" in [ a > 5 ]
        Create a new ndb condition for ">";
        Set expect context = { supported Item Types:  string, real, decimal, int, field, varbin }
                             { supported FieldResultTypes:  String, real, int, decimal }
        field_type and result type may be different due to default type promotion
        like varchar to char etc.

Step 3: Process "a"
        Create a new ndb condition -- because we are serializing the condition in a list.
        item->type() is Item::FIELD_ITEM;
        field_item->field->type() is MYSQL_TYPE_LONG (int is mapped to this type)
        check (context->table->s == field->table->s)   // filter for single table only
        fieldtype BLOB types (MYSQL_TYPE_*_BLOB) (TINY/MEDIUM/LONG) are not supported.
        Update the ndb condition (in the serialized list) with the field and the NDB
        column number for the field.
        Reset the context to not expect a Field Item now.
        (At most one field can be used within one boolean expression component:
         a<5 and b<3 is allowed, but a second field within the same component is not.)
        [ context->dont_expect(Item::FIELD_ITEM); context->expect_no_field_result(); ]
        Set context := { expected other parameter type: INT_ITEM or VARBIN_ITEM }

Step 4: Process "5"
        item->type() is Item::INT_ITEM
        Check with context if "field" has been processed already. [ yes. In case the
        field has not been seen yet, we would set context to expect only field next. ]
        Finished processing both Field item and Int Item. End of one condition.
        Now set the context to expect := { and/or/in/between conditions, <, >, /, = }

Step 5: Process ">" in b>3
Step 6: Process "b"
Step 7: Process "3"

Step 8: At the end of AND/OR, call the traverse function with item=NULL (as a marker).
        This is to put an "end-of-condition" marker in the serialized ndb condition list.
=========================================================================
Note: The following conditions are supported:
  a in (1 , 2, 3) ;       rewrite using OR
  a not in (1, 2, 3) ;    rewrite
  a between 1 AND 5 ;     rewrite using AND
  a in (5, 4+6, 3) ;      supported !
In case of the IN function, rewrite the function using OR.
In case of the BETWEEN function, rewrite the function using AND.
=========================================================================
Note: Only Item_cond, on prefix traversal, calls (*traverser)(NULL, arg) at the end.
      Other traverse functions don't. (!!!)
Note: Single node items (like field item, etc) use Item's trivial traverse_cond()
      virtual function supplied, which just visits "this" node.
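To make the serialization steps above easier to follow, here is a minimal, hedged
sketch of the kind of "expect" bookkeeping they describe. This is NOT NDB's actual
context class; the class and member names below (Expect_context, expect(),
dont_expect(), check()) are illustrative only, written against the Item::Type
values already mentioned in these notes.

  #include "mysql_priv.h"   // sketch assumes a server build; brings in Item, uint

  // Bitmask of Item::Type values the serializer is willing to accept next
  // while walking the condition tree in prefix order.
  class Expect_context
  {
    uint expect_mask;                    // one bit per Item::Type value
  public:
    bool supported;                      // cleared on the first unexpected item
    Expect_context() : expect_mask(0), supported(true) {}

    void expect(Item::Type t)      { expect_mask |=  (1U << t); }
    void dont_expect(Item::Type t) { expect_mask &= ~(1U << t); }
    void expect_nothing()          { expect_mask= 0; }

    // Called for each visited item; e.g. after a FIELD_ITEM has been consumed,
    // dont_expect(Item::FIELD_ITEM) makes a second field in the same predicate fail.
    void check(const Item *item)
    {
      if (item && !(expect_mask & (1U << item->type())))
        supported= false;
    }
  };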
Note: For a filter expression: (a LIKE '%this string%')
      Expect only STRING_ITEM|FIELD_ITEM with a result type of
      var_string|varchar with string result.
=========================================================================
MySQL Field Types:   mysql_com.h :

enum enum_field_types {
  MYSQL_TYPE_DECIMAL, MYSQL_TYPE_TINY, MYSQL_TYPE_SHORT, MYSQL_TYPE_LONG,
  MYSQL_TYPE_FLOAT, MYSQL_TYPE_DOUBLE, MYSQL_TYPE_NULL, MYSQL_TYPE_TIMESTAMP,
  MYSQL_TYPE_LONGLONG, MYSQL_TYPE_INT24, MYSQL_TYPE_DATE, MYSQL_TYPE_TIME,
  MYSQL_TYPE_DATETIME, MYSQL_TYPE_YEAR, MYSQL_TYPE_NEWDATE, MYSQL_TYPE_VARCHAR,
  MYSQL_TYPE_BIT, MYSQL_TYPE_NEWDECIMAL=246, MYSQL_TYPE_ENUM=247,
  MYSQL_TYPE_SET=248, MYSQL_TYPE_TINY_BLOB=249, MYSQL_TYPE_MEDIUM_BLOB=250,
  MYSQL_TYPE_LONG_BLOB=251, MYSQL_TYPE_BLOB=252, MYSQL_TYPE_VAR_STRING=253,
  MYSQL_TYPE_STRING=254, MYSQL_TYPE_GEOMETRY=255
}

Item result types:
enum Item_result { STRING_RESULT=0, REAL_RESULT, INT_RESULT, ROW_RESULT, DECIMAL_RESULT };

enum Functype {
  UNKNOWN_FUNC, EQ_FUNC, EQUAL_FUNC, NE_FUNC, LT_FUNC, LE_FUNC,
  GE_FUNC, GT_FUNC, FT_FUNC, LIKE_FUNC, ISNULL_FUNC, ISNOTNULL_FUNC,
  COND_AND_FUNC, COND_OR_FUNC, COND_XOR_FUNC, BETWEEN, IN_FUNC,
  MULT_EQUAL_FUNC, INTERVAL_FUNC, ISNOTNULLTEST_FUNC,
  SP_EQUALS_FUNC, SP_DISJOINT_FUNC, SP_INTERSECTS_FUNC,
  SP_TOUCHES_FUNC, SP_CROSSES_FUNC, SP_WITHIN_FUNC,
  SP_CONTAINS_FUNC, SP_OVERLAPS_FUNC,
  SP_STARTPOINT, SP_ENDPOINT, SP_EXTERIORRING,
  SP_POINTN, SP_GEOMETRYN, SP_INTERIORRINGN,
  NOT_FUNC, NOT_ALL_FUNC, NOW_FUNC, TRIG_COND_FUNC,
  SUSERVAR_FUNC, GUSERVAR_FUNC, COLLATE_FUNC,
  EXTRACT_FUNC, CHAR_TYPECAST_FUNC, FUNC_SP, UDF_FUNC, NEG_FUNC
};

== Storage Engine API Extensions For Query Fragment Pushdown ==

This document summarizes the ideas and plans for MySQL storage engine API
extensions primarily focused on supporting query fragment pushdown.

=== Version History ===

Date         Author   Comments
04-Sep-2009  Thava    Initial version after discussions with Mikael Ronstrom,
                      Timour and Storage Engine API Team.

=== Background ===

In many instances, it is much more efficient to push parts of query conditions down to storage engines rather than evaluating them in the server. Such conditions could be either trivial scan filters (e.g. "a = 30") or more complex ones (e.g. join conditions on different tables). Pushing down constant scan filters was added recently for the cluster storage engine. However, pushing down more complex conditions is required and not yet supported. This requirement is especially critical for the cluster storage engine: without it, the entire tables involved in the query are fetched to the server from the data nodes before, for example, joins are applied. This could be avoided by pushing such conditions to storage engines to limit the amount of data shipped across nodes.

The current storage engine API exposes too many internal details and is therefore fragile whenever anything changes internally. We need an API that exposes the data structures through abstract and stable interfaces. The new API should be very well documented so that it is easier to understand and use.

=== Goals ===

The primary goals are mentioned below:

# Enable query fragment pushdown (QFPD) to storage engines. The immediate next step should support pushing down join conditions on tables belonging to the same storage engine. More complex conditions would follow later; however, the design should be easily extensible in the future. The API extensions would make use of easy-to-use and clearly documented interfaces. A new AQT (Abstract Query Tree) data structure will be created and used as the parameter type for the new API extensions.
# Extend the AQT interfaces so that they can potentially be used by both the server and storage engines. Such use is not limited to QFPD. The benefit would be easier-to-use APIs across different server modules and storage engines. (This is a long term goal.)

=== Abstract Query Tree ===

The Abstract Query Tree is the core theme which enables an easy-to-use and stable interface between the server and storage engines. As a first step, it will be used to pass query fragments to a storage engine for pushdown. In the longer term, it is envisioned to be extensible and usable within the server itself whenever appropriate.

=== Worklogs Reference ===

* See worklogs:
** [https://intranet.mysql.com/worklog/?tid=4292 WL#4292 Pushdown of query fragments to storage engines]
*** [https://intranet.mysql.com/worklog/?tid=4533 WL#4533 QFPD - Abstract Query tree (AQT) ]
*** [https://intranet.mysql.com/worklog/?tid=4535 WL#4535 QFPD: Storage Engine Pushdown API (SEPD API)]
* See Also:
** [https://intranet.mysql.com/worklog/?tid=4536 WL#4536 QFPD: Query decomposition and optimization for query fragments]
** [https://intranet.mysql.com/worklog/?tid=4537 WL#4537 QFPD: Virtual Tables]
** [https://intranet.mysql.com/worklog/?tid=3288 WL#3288 Foreign Keys: Storage engine API for foreign key support]

=== Main Tasks ===

The main tasks involved are:
* Clearly document the existing storage engine API: both in a separate document and in code.
* Define the AQT data structure for expressions to push down.
* Define new storage engine API extensions which make use of the AQT for pushdown of join conditions.
* Clearly document the new API extensions.

[[Category:Specifications]] [[Category:MySQLDevelopment]]

=== WL#4535 scratchpad ===

==== Example applications of AQT designs to imaginary and real storage engines ====

===== Engine capabilities =====

* Engine 1:
** supports only 2-way joins
** all conditions are atomic
** no aggregation, group by, order by
* Engine 2:
** supports N-way joins
** all conditions are atomic
** no aggregation, group by, order by

===== Application of AQT designs to the engines above =====

* Multiple iterators approach (by Thava)
** Engine 1:
my_se::pushdown_analysis(aqt *qf)
{
  aqt_cond cur_join_cond = qf->get_join_cond();
  if (cur_join_cond is not an atomic condition)
  {
    qf->mark_non_pd();
    return;
  }

  aqt_abstract_table *cur_join_operand;
  cur_join_operand = qf->get_left();
  if (cur_join_operand->type() != AQT_BASE_TABLE)
  {
    qf->mark_non_pd();
    return;
  }

  cur_join_operand = qf->get_right();
  if (cur_join_operand is not a base table)
  {
    qf->mark_non_pd();
    return;
  }
}
** Engine 2:

my_se::pushdown_analysis(aqt *qf)
{
  aqt_cond cur_join_cond = qf->get_join_cond();
  if (cur_join_cond is not an atomic condition)
  {
    qf->mark_non_pd();
    return;
  }
.................

}
* Single abstract iterator (Timour): TODO

=== How to expose Table as abstract Table (Serg's Draft) ===

struct TABLE
{
  TABLE_SHARE    *s;
  TABLE *share_next, **share_prev;
};

class Public_Table : private TABLE {
    char *get_name() { return s->table_name.str; }
    Public_Table *next_in_list() { return (Public_Table *)next; }
    bool GIS_Is_Supported () { return s->db_type->flags & HA_CAN_GEOMETRY; }
    Public_Table *next_InnoDB_table() {
      TABLE *n;
      for (n=next; n; n=n->next)
        if (n->s->db_type == innobase_hton_ptr)
          return n;
      return NULL;
    }
    void mark_as_pushdownable();
}

//*+++++++****************************************+

  ...
  TABLE *table;
  ...
  handler->AQT_push_down((Public_Table*)table);


Item_func()....


Public_Item_func() {
  next_child(iterator_state *state) {
    if (state->current == 0)
      return arg0;
    return arguments[state->current++ - 1];
  }
}
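
For illustration only, here is a hedged sketch of how an engine might consume the
draft wrappers above. It assumes the accessors shown in the draft (get_name(),
next_in_list(), GIS_Is_Supported(), next_InnoDB_table(), mark_as_pushdownable())
were made public; none of this is an existing MySQL API, and stdio is assumed for
the debug output.

  #include <stdio.h>

  // Print every table in the list and whether it supports GIS.
  static void dump_tables(Public_Table *first)
  {
    for (Public_Table *t= first; t; t= t->next_in_list())
      fprintf(stderr, "table %s  GIS: %s\n",
              t->get_name(), t->GIS_Is_Supported() ? "yes" : "no");
  }

  // Flag the InnoDB tables that follow 'first' in the list as pushdown candidates
  // (next_InnoDB_table() in the draft starts from the table after 'this').
  static void mark_innodb_pushdown(Public_Table *first)
  {
    for (Public_Table *t= first->next_InnoDB_table(); t; t= t->next_InnoDB_table())
      t->mark_as_pushdownable();
  }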

=== List of Objects and functionalities that need abstraction ===

------------------------------------------------------------------------------

 AQT Object         :  Query
 Current Equivalent :  Statement (the parent of THD; contains the parser state of the query, i.e. LEX (thd->lex))
 Proposed Class Name:  AQTStmt
 Comments:  This represents the entire query after parsing. It should be usable for pushdown into an SE or for any other purpose that requires examining the query.

List Of AQT Methods needed :   (TBD)

  class AqtStmt : private Statement {

     public:

        // Define public interface for the statement

  };

------------------------------------------------------------------------------

AQT Object        :  (Conditional) Expression
Current Equivalent:  COND (i.e. Item)
Proposed Classname:  AqtCond

class AqtCond : private Item {
  public:  //
};
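
A possible public surface for AqtCond, sketched below. Everything here is a
proposal, not existing code; it relies only on Item members that do exist
today (Item::type(), Item_func::argument_count(), Item_func::arguments()).
AND/OR operands live in a List<Item> inside Item_cond and would need a
separate iterator, which is left out of this sketch.

  #include "mysql_priv.h"

  class AqtCond : private Item
  {
  public:
    bool is_and_or() const              // AND/OR node (Item_cond)
    { return type() == Item::COND_ITEM; }

    bool is_atomic_comparison() const   // e.g. <, >, =, BETWEEN, IN (Item_func)
    { return type() == Item::FUNC_ITEM; }

    // Operands of an atomic comparison, wrapped again so that a storage
    // engine never has to touch raw Item pointers.
    uint arg_count() const
    {
      return is_atomic_comparison() ?
             ((Item_func*) (Item*) this)->argument_count() : 0;
    }
    AqtCond *arg(uint i) const
    { return (AqtCond*) ((Item_func*) (Item*) this)->arguments()[i]; }
  };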

------------------------------------------------------------------------------

=== Class Hierarchies of important classes ===
   
  Item Class Hierarchy :

http://forge.mysql.com/w/images/1/16/ClassItem_inherit_graph.png

=== NDB Current pushdown implementation ===

We describe the NDB implementation since it is a good example input for the
AQT implementation.

Current NDB pushdown supports only conditions on a single table, used as
column filters, and there are certain restrictions on the type of filters.

Examples of supported pushdown where conditions:
 
     a > 2 and b > 3;      // i.e. select * from t where a>2 and b>3;
     a > 2 and b > 3 and c > 4;   // composite logical and/or OK
     a > 2  or b > 3;
     a > 2+3 and b < mod(10, 8);  // constant functions are OK
     str like '%hello%'  ;        
     mydate > '2008-02-25' ;   // date can be compared with const date string

     a in (2, 3, 4);     // rule is rewritten using OR
     a in (2, 3+5, 10);  // constant expressions are OK
     a  between  5 and 10;  // rule rewritten using AND

Column Types Supported : string,real,decimal,int, varbin, 
                         date, time, datetime, year

Other column types such as blob types are not supported.
Field Result Types Supported : String, real, int, decimal

============================================================================
 
The handler infrastructure for this pushdown condition is the handler class
function COND *cond_push(const COND *cond), which is implemented by the NDB
engine. The COND type is the same as the Item type.

The handler function ha_ndbcluster::cond_push(COND *cond), which implements the
current pushdown method, is fairly complicated (850+ lines of code) even though
the problem specification may sound simple. Here is simplified *pseudo* code of
the current NDB implementation (omitting much of the lower level detail):

   // Return NULL on successful push else return rejected condition
   COND * ha_ndbcluster::cond_push(COND *cond)
   {
      context = Initialize a dynamic context object which specifies
                things like 
                (1) what Item Types are expected e.g. COND_ITEM, FUNC_ITEM
                (2) what result types are expected e.g. INT_RESULT,REAL_RESULT
                etc.
      item->traverse_cond(&ndb_serialize_cond, 
                 (void *) &context, Item::PREFIX);

      if (context.supported) return NULL  
      else  return  cond;

   }
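
For reference, a minimal handler-level sketch of this hook for a hypothetical
engine "my_se" (class declaration and handlerton setup omitted; my_pushed_cond
is an assumed engine-specific member). The contract is the one described
above: return NULL if the whole condition was accepted, otherwise return the
part that could not be pushed (here: all of it). The exact const-ness of the
signature should be checked against handler.h.

  const COND *ha_my_se::cond_push(const COND *cond)
  {
    // Accept only a single atomic comparison, e.g.  col > 3
    if (cond->type() == Item::FUNC_ITEM)
    {
      my_pushed_cond= cond;   // engine-specific member; remembered for scans
      return NULL;            // everything pushed down
    }
    return cond;              // rejected: the server evaluates it itself
  }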

  The Item class (and all its subclasses) provides a traverse_cond() virtual function to traverse its tree. For example, for Item_ident this function is trivial: it visits only its own node. For Item_func, it invokes the given function on its operands in the specified order.
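
A small usage sketch of that traversal. The callback signature follows the
Cond_traverser typedef in item.h (check the exact const-ness in your tree);
the rest of the names here are made up.

  #include "mysql_priv.h"

  struct walk_ctx { uint items_seen; bool supported; };

  static void count_items(const Item *item, void *arg)
  {
    walk_ctx *ctx= (walk_ctx*) arg;
    if (!item)                  // some traversals pass a NULL end-of-group marker
      return;
    ctx->items_seen++;
    // A real serializer (ndb_serialize_cond) switches on item->type() here
    // and flips ctx->supported for anything it cannot handle.
    if (item->type() != Item::COND_ITEM && item->type() != Item::FUNC_ITEM &&
        item->type() != Item::FIELD_ITEM && item->type() != Item::INT_ITEM)
      ctx->supported= false;
  }

  static bool quick_walk(Item *cond)
  {
    walk_ctx ctx= { 0, true };
    cond->traverse_cond(count_items, (void*) &ctx, Item::PREFIX);
    return ctx.supported;
  }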

     The ndb_serialize_cond() function is used to traverse the condition and generate a filter condition for NDB in its own list format. We will focus on the logic of traversing the condition and deciding whether the pushdown condition is supported or not.

Pseudo code :

Step 0:

   At the top level, a boolean expression is expected.
   context :=  { Expected Item types = FUNC_ITEM, COND_ITEM }
   Note: FUNC_ITEM is used for <, >, >=, <=, =, NOT, IN, BETWEEN
         COND_ITEM is used for AND, OR
     Internally, Item_cond is a subclass of Item_func;
     however Item_cond is used only for logical AND/OR operations.

Step 1: Call  ndb_serialize_cond() recursively on PREFIX order on item tree:
      item->traverse_cond(&ndb_serialize_cond, 
                 (void *) &context, Item::PREFIX);

   ndb_serialize_cond(Item *item, Context *ctxt)
   {

     if (  item type or item result type is not one of currently expected 
           types as per context)
                 mark context as "unsupported" and return;

   switch item type {
     case COND_ITEM :  Make sure it is AND/OR nothing else. 
                       Push this AND/OR condition into NDB condition list.
                       ctxt holds same item types to expect next:
                                {Expect: FUNC_ITEM, COND_ITEM type nodes} 
                       break;

     case FUNC_ITEM :  /* It is one of <, >, =, IN, BETWEEN, NOT etc */

                       [If the operator is one of IN, BETWEEN, 
                       equivalent AND/OR conditions are pushed into NDB list.
                       The context holds enough information to remember
                       the states across calls. Further details are omitted
                       in the interest of brevity regarding IN, BETWEEN ops.]

                       Push this condition node into NDB list.
                       Set Context to expect the type of operands next:
                       {ItemType:string,real,decimal,int,field,varbin;} 
                       {FieldResultType:- String, real, int, decimal}
                       break;

     case FIELD_ITEM : assert field->type() is one of the supported types.
                       Note: blob types are not supported here.
                       Push this node into the NDB list, including column number info.
                       Mark Context to expect:
                             {No more Field Items}  // at most 1 field per comparison
                             {Expect compatible item types}
                       e.g. If field->type() is MYSQL_TYPE_LONG,
                            then expect INT_ITEM
      
     case  INT_ITEM  :  If (no field item has been processed  before)
                        { set context to expect: {Only Field Item  next}}
                        else { /* both field and INT item processed now  
                                  Reset context to accept new conditions.
                                  e.g. a < 5 and b > 3 */
                           Context := {Expect FUNC_ITEM or COND_ITEM types}
                        }  
                        push this item into NDB serialized list.               

    case Item::DECIMAL_ITEM:  /* Similar */
    case Item::VARBIN_ITEM:   /* Similar */   
    case Item::REAL_ITEM:     /* Similar */
    case Item::STRING_ITEM:   /* Similar */

   }



====================================================================================

/** libmysqld.a embedded library explained LINE 1990 */
libmysqld.a embedded library :

Function             When to Call
mysql_library_init()     Should be called first. Pass --defaults-file, etc here.
mysql_library_end()     Should be called before your program exits.
mysql_thread_init()     Should be called in each thread you create that accesses MySQL.
mysql_thread_end()     Should be called before calling pthread_exit()
    

Example test code:
    gcc mysql_test.c -o mysql_test -lz \
    `/usr/local/mysql/bin/mysql_config --include --libmysqld-libs`

Note:  Separate [client] and [server] options must exist in my.cnf
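
A minimal mysql_test.c along those lines (error handling trimmed; the server
options and paths below are just examples, adjust for your installation):

  #include <stdio.h>
  #include <mysql.h>

  static char *server_args[]= {
    (char*) "this_program",                       /* argv[0] is ignored */
    (char*) "--datadir=/tmp/mysql_embedded_data",
    (char*) "--key_buffer_size=32M"
  };
  static char *server_groups[]= {
    (char*) "embedded", (char*) "server", (char*) NULL
  };

  int main(void)
  {
    MYSQL *conn;

    if (mysql_library_init(sizeof(server_args)/sizeof(char*),
                           server_args, server_groups))
      return 1;                                   /* could not start server */

    conn= mysql_init(NULL);
    mysql_options(conn, MYSQL_OPT_USE_EMBEDDED_CONNECTION, NULL);
    if (!mysql_real_connect(conn, NULL, NULL, NULL, "test", 0, NULL, 0))
    {
      fprintf(stderr, "connect failed: %s\n", mysql_error(conn));
      return 1;
    }

    if (!mysql_query(conn, "SELECT VERSION()"))
    {
      MYSQL_RES *res= mysql_store_result(conn);
      MYSQL_ROW row= mysql_fetch_row(res);
      printf("server version: %s\n", row[0]);
      mysql_free_result(res);
    }

    mysql_close(conn);
    mysql_library_end();                  /* shuts the embedded server down */
    return 0;
  }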

====================================================================================


/** Optimization, JOIN::prepare(), optimize(), etc closer look. LINE 2010 */

Investigate optimization layer functions for:
  select * from t1, t2 where t1.id = t2.id;

mysql_parse()=>mysql_execute_command()=>
      >mi_extra ; enter: function: 45 -- called few times ???
     handle_select()=>mysql_select()=>

JOIN::prepare(){
  // join structure is initialized with "conds", order, group, having info.
  // select_lex->join = this;
  // tables_list = initialized to TABLE_LIST* ;
  // join_list = &select_lex->top_join_list; initialized to List *

  Note: some info is taken from select_lex; some info is initialized in
        select_lex as well!!!  JOIN contains select_lex!
        Much of its logic here is in setting up select_lex!

   setup_tables(); // select_lex->leaf_tables are initialized here only!
   setup_wilds();
   setup_fields();
   setup_without_groups();
}
JOIN::flatten_subqueries();
JOIN::optimize(){

    SELECT_LEX *sel = thd->lex->current_select; // sel is fixed even here!!!
    // This sel is same as JOIN::select_lex variable in simple cases.
   
    simplify_joins(); // convert outer joins to inner joins if possible, etc
            // This fixes join_list variable (of type List*)

    optimize_cond(){
       WHERE Equal condition gets translated to multiple-equal(t1.i, t2.i);
    }

    make_join_statistics(){
        JOIN_TAB  *stat, **stat_ref;
        JOIN_TAB  *stat_vector[MAX_TABLES+1];

        join->best_ref = stat_vector; // Array of join tabs

        For every table s in TABLE_LIST, initialize JOIN_TAB and put them in array

// Now you have one QEP though it may be the worst one. If you do a cross join of
// all of them and apply where condition, you got the result.
// Now we will reorder them and attach conditions.
        

        pull_out_semijoin_tables();
        choose_plan(){
           greedy_search(){
              best_extension_by_limited_search(){
                 best_access_path()
              }
           }
        }
    } // end of make_join_statistics();

    get_best_combination();
    make_select();
    make_outerjoin_info();
    make_join_select();
    remove_const();
    TEST_join();
} // end of JOIN::optimize()


================================================================

Pseudo code for greedy_search()

   All intermediate and final results of the procedure are stored in 'join':
    - join->positions     : modified for every partial QEP that is explored
    - join->best_positions: modified for the current best complete QEP
    - join->best_read     : modified for the current best complete QEP
    - join->best_ref      : might be partially reordered

    The final optimal plan is stored in 'join->best_positions', and its
    corresponding cost in 'join->best_read'.

  @note
    The following pseudocode describes the algorithm of 'greedy_search':

    @code
    procedure greedy_search
    input: remaining_tables
    output: pplan;
    {
      pplan = <>;
      do {
        (t, a) = best_extension(pplan, remaining_tables);
        pplan = concat(pplan, (t, a));
        remaining_tables = remaining_tables - t;
      } while (remaining_tables != {})
      return pplan;
    }
================================================================

    procedure best_extension_by_limited_search()

/**
  Find a good, possibly optimal, query execution plan (QEP) by a possibly
  exhaustive search.

    The procedure searches for the optimal ordering of the query tables in set
    'remaining_tables' of size N, and the corresponding optimal access paths to
    each table. The choice of a table order and an access path for each table
    constitutes a query execution plan (QEP) that fully specifies how to
    execute the query.

    The maximal size of the found plan is controlled by the parameter
    'search_depth'. When search_depth == N, the resulting plan is complete and
    can be used directly as a QEP. If search_depth < N, the found plan consists
    of only some of the query tables. Such "partial" optimal plans are useful
    only as input to query optimization procedures, and cannot be used directly
    to execute a query.

    The algorithm begins with an empty partial plan stored in 'join->positions'
    and a set of N tables - 'remaining_tables'. Each step of the algorithm
    evaluates the cost of the partial plan extended by all access plans for
    each of the relations in 'remaining_tables', expands the current partial
    plan with the access plan that results in lowest cost of the expanded
    partial plan, and removes the corresponding relation from
    'remaining_tables'. The algorithm continues until it either constructs a
    complete optimal plan, or constructs an optimal partial plan with size =
    search_depth.
    The final optimal plan is stored in 'join->best_positions'. The
    corresponding cost of the optimal plan is in 'join->best_read'.

  @note
    The procedure uses a recursive depth-first search where the depth of the
    recursion (and thus the exhaustiveness of the search) is controlled by the
    parameter 'search_depth'.

  @note
    The pseudocode below describes the algorithm of
    'best_extension_by_limited_search'. The worst-case complexity of this
    algorithm is O(N*N^search_depth/search_depth). When search_depth >= N, then
    the complexity of greedy_search is O(N!).

    @code
    procedure best_extension_by_limited_search(
      pplan in,             // in, partial plan of tables-joined-so-far
      pplan_cost,           // in, cost of pplan
      remaining_tables,     // in, set of tables not referenced in pplan
      best_plan_so_far,     // in/out, best plan found so far
      best_plan_so_far_cost,// in/out, cost of best_plan_so_far
      search_depth)         // in, maximum size of the plans being considered
    {
      for each table T from remaining_tables
      {
        // Calculate the cost of using table T as above
        cost = complex-series-of-calculations;

        // Add the cost to the cost so far.
        pplan_cost+= cost;

        if (pplan_cost >= best_plan_so_far_cost)
          // pplan_cost already too great, stop search
          continue;

        pplan= expand pplan by best_access_method;
        remaining_tables= remaining_tables - table T;
        if (remaining_tables is not an empty set
            and
            search_depth > 1)
        {
          best_extension_by_limited_search(pplan, pplan_cost,
                                           remaining_tables,
                                           best_plan_so_far,
                                           best_plan_so_far_cost,
                                           search_depth - 1);
        }
        else
        {
          best_plan_so_far_cost= pplan_cost;
          best_plan_so_far= pplan;
        }
      }
    }
    @endcode

  @note
    When 'best_extension_by_limited_search' is called for the first time,
    'join->best_read' must be set to the largest possible value (e.g. DBL_MAX).
    The actual implementation provides a way to optionally use pruning
    heuristic (controlled by the parameter 'prune_level') to reduce the search
    space by skipping some partial plans.

  @note
    The parameter 'search_depth' provides control over the recursion
    depth, and thus the size of the resulting optimal plan.

  @param join             pointer to the structure providing all context info
                          for the query
  @param remaining_tables set of tables not included into the partial plan yet
  @param idx              length of the partial QEP in 'join->positions';
                          since a depth-first search is used, also corresponds
                          to the current depth of the search tree;
                          also an index in the array 'join->best_ref';
  @param record_count     estimate for the number of records returned by the
                          best partial plan
  @param read_time        the cost of the best partial plan
  @param search_depth     maximum depth of the recursion and thus size of the
                          found optimal plan
                          (0 < search_depth <= join->tables+1).
  @param prune_level      pruning heuristics that should be applied during
                          optimization
                          (values: 0 = EXHAUSTIVE, 1 = PRUNE_BY_TIME_OR_ROWS)

================================================================

/**
  Find the best access path for an extension of a partial execution
  plan and add this path to the plan.

  The function finds the best access path to table 's' from the passed
  partial plan where an access path is the general term for any means to
  access the data in 's'. An access path may use either an index or a scan,
  whichever is cheaper. The input partial plan is passed via the array
  'join->positions' of length 'idx'. The chosen access method for 's' and its
  cost are stored in 'join->positions[idx]'.

  @param join             pointer to the structure providing all context info
                          for the query
  @param s                the table to be joined by the function
  @param thd              thread for the connection that submitted the query
  @param remaining_tables set of tables not included into the partial plan yet
  @param idx              the length of the partial plan
  @param record_count     estimate for the number of records returned by the
                          partial plan
  @param read_time        the cost of the partial plan

  @return
    None
*/

static void
best_access_path(JOIN      *join,
                 JOIN_TAB  *s,
                 THD       *thd,
                 table_map remaining_tables,
                 uint      idx,
                 double    record_count,
                 double    read_time)
{
....

     /* Update the cost information for the current partial plan */
     join->positions[idx].records_read= records;
     join->positions[idx].read_time=    best;
     join->positions[idx].key=          best_key;
     join->positions[idx].table=        s;
     join->positions[idx].ref_depend_map= best_ref_depends_map;
     join->positions[idx].use_insideout_scan= best_is_sj_inside_out;
....
}

================================================================


                join->prepare(...);{
                   setup_tables_and_check_access(); // basic setup
                }
                join->flatten_subqueries();
                join->optimize();
                join->exec() {
                    do_select()=> sub_select(){
                       join_init_read_record(JOIN_TAB *tab){

class JOIN :public Sql_alloc
{
  JOIN_TAB *join_tab,**best_ref;
  JOIN_TAB **map2table;    ///< mapping between table indexes and JOIN_TABs
  TABLE    **all_tables;
  uint     tables;        /**< Number of tables in the join */
  uint     outer_tables;  /**< Number of tables that are not inside semijoin */
  uint     const_tables;
  bool     full_join, group, ... ;
  table_map const_table_map,found_const_table_map,outer_join;
  ha_rows  send_records,found_records,examined_rows,row_limit, select_limit;
  // Note: JOIN maintains the state of execution ??!!
  POSITION positions[MAX_TABLES+1],best_positions[MAX_TABLES+1];
  double   best_read;
  List<Item> *fields;
  List<Cached_item> group_fields, group_fields_cache;
  TABLE    *tmp_table;
 /// used to store 2 possible tmp table of SELECT
  TABLE    *exec_tmp_table1, *exec_tmp_table2;
  THD      *thd;
  select_result *result;
  ...  
  /// unit structure (with global parameters) for this select
  SELECT_LEX_UNIT *unit;
  /// select that processed
  SELECT_LEX *select_lex;
  ...
  TABLE_LIST *tables_list;           ///< holds the 'tables' parameter of mysql_select
  List<TABLE_LIST> *join_list;       ///< list of joined tables in reverse order
  SQL_SELECT *select;                ///< created in optimisation phase
  ...
};

================================================================

Debug trace from greedy_search()/best_extension_by_limited_search() for the
2-table join above (t1, t2):

Iteration 1:
     // best: DBL_MAX means no complete plan has been found yet; pruning by
     // cost only happens once (join->best_read < DBL_MAX),
     //      i.e. at least one complete plan has been found already.

      BEST_REF: t1(3,3,2) t2(246,246,2)  // table(found_records,records,readtime)
     // This is the list of join tabs involved in the query.

Iteration 2:

SOFAR:; idx: 1  best: DBL_MAX  atime: 1  itime: 3  count: 2.00659
     POSITIONS: t1
     BEST_REF: t1(3,3,2) t2(246,246,2)
part_plan; idx: 1  best: DBL_MAX  atime: 2.00659  itime: 2.00659  count: 3
     POSITIONS: t1
     BEST_REF: t1(3,3,2) t2(246,246,2)


Iteration 3:
full_plan; idx :2  best: 152.146  accumulated: 152.147  increment: 2.00659  count: 738
     POSITIONS: t1 t2
BEST_POSITIONS: t1 t2
      BEST_REF: t1(3,3,2) t2(246,246,2)

===============================================================================
Iteration 4:

pruned_by_heuristic; idx :1  best: 152.146  accumulated: 2.54053  increment: 0  count: 246
     POSITIONS: t2
BEST_POSITIONS: t1
      BEST_REF: t1(3,3,2) t2(246,246,2)

===============================================================================
Greedy search then prints:

optimal; idx :2  best: 152.146  accumulated: 0  increment: 0  count: 1
     POSITIONS: t2 t2
BEST_POSITIONS: t1 t2
      BEST_REF: t1(3,3,2) t2(246,246,2)
===============================================================================

Info about JOIN
t1                type: ALL      q_keys: 0  refs: 0  key: -1  len: 0
t2                type: ALL      q_keys: 0  refs: 0  key: -1  len: 0
===============================================================================


 Determining the Join Type

When evaluating a conditional expression, MySQL decides what join type the expression has. (Again: despite the word join, this applies for all conditional expressions, not just join expressions. A term like access type would be clearer.) These are the documented join types, in order from best to worst:

    * system: a system table which is a constant table
    * const: a constant table
    * eq_ref: a unique or primary index with an equality relation
    * ref: an index with an equality relation, where the index value cannot be NULL
    * ref_or_null: an index with an equality relation, where it is possible for the index value to be NULL
    * range: an index with a relation such as BETWEEN, IN, >=, LIKE, and so on.
    * index: a sequential scan on an index
    * ALL: a sequential scan of the entire table 

See: /sql/sql_select.h, enum join_type{}. Notice that there are a few other (undocumented) join types too, for subqueries.

The optimizer can use the join type to pick a driver expression. For example, consider this query:

SELECT *
FROM Table1
WHERE indexed_column = 5 AND unindexed_column = 6

Since indexed_column has a better join type, it is more likely to be the driver.
=======================================================================

A combination of a fixed order in which tables are joined and the corresponding table access method for each table is called a query execution plan (QEP).


In MySQL, the search for an optimal QEP is performed in a bottom-up manner. The optimizer first considers all plans for one table, then all plans for two tables, and so on, until it builds a complete optimal QEP. 

=======================================================================

 The index Join Type

Consider this query:

SELECT column1 FROM Table1;

If column1 is indexed, then the optimizer may choose to retrieve the values from the index rather than from the table. An index which is used this way is called a covering index in most texts. MySQL simply uses the word index in EXPLAIN descriptions.

=======================================================================

 The Index Merge Join Type

Index Merge is used when the table condition can be converted to the form:

cond_1 OR cond_2 ... OR cond_N

The conditions for conversion are that each cond_i can be used for a range scan, and no pair (cond_i, cond_j) uses the same index. (If cond_i and cond_j use the same index, then cond_i OR cond_j can be combined into a single range scan and no merging is necessary.)

For example, Index Merge can be used for the following queries:

SELECT * FROM t WHERE key1=c1 OR key2=c2;

=======================================================================

 Early NULL filtering for ref access

A join equality such as  tblX.column = tblY.column2  is null-rejecting:
it implies the added predicate  (tblX.column IS NOT NULL)

The original equality can be checked only after we've read the current rows of both tables tblX and tblY. The IS NOT NULL predicate can be checked after we've read the current row of table tblX. If there are any tables in the join order between tblX and tblY, the added IS NOT NULL check will allow us to skip accessing those tables.

This feature is implemented in these places in the server code:

    * The ref analyzer (contained in such functions as update_ref_and_keys()) detects and marks equalities like that shown above by setting KEY_FIELD::null_rejecting=TRUE.
    * After the join order has been chosen, add_not_null_conds() adds appropriate IS NOT NULL predicates to the conditions of the appropriate tables. 

It is possible to add IS NOT NULL predicates for all equalities that could be used for ref access (not only for those that are actually used). However, this is currently not done.
=======================================================================

 Partition pruning

The operation of partition pruning is defined as follows:

Given a query over partitioned table, match the table DDL against any WHERE or ON clauses, and find the minimal set of partitions that must be accessed to resolve the query. 

=======================================================================
See here for Join Buffer algorithm :
http://forge.mysql.com/wiki/MySQL_Internals_Algorithms
=======================================================================
Difference between start_stmt() and external_lock() mechanisms to detect txns:

Normally the server signals statement boundaries to an engine through
handler::external_lock(). If a user calls LOCK TABLES, MySQL calls
handler::external_lock() only once (not once per statement); in that case
MySQL calls handler::start_stmt() at the beginning of each statement instead.
For more info, see: http://forge.mysql.com/wiki/MySQL_Internals_Custom_Engine
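
A sketch of the two hooks for a hypothetical engine "my_se" (class declaration
and handlerton setup omitted; my_se_hton is assumed to be set during plugin
init). trans_register_ha() and thd_test_options() are the existing server
helpers for registering in the current statement/transaction; verify the
details against handler.cc and the custom-engine wiki page above.

  #include "mysql_priv.h"

  static handlerton *my_se_hton;     /* hypothetical; set during plugin init */

  int ha_my_se::external_lock(THD *thd, int lock_type)
  {
    if (lock_type != F_UNLCK)
    {
      /* A statement (and possibly a transaction) starts for this table */
      trans_register_ha(thd, FALSE, my_se_hton);       /* for the statement  */
      if (thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN))
        trans_register_ha(thd, TRUE, my_se_hton);      /* for the whole txn  */
    }
    return 0;
  }

  int ha_my_se::start_stmt(THD *thd, thr_lock_type lock_type)
  {
    /* Reached instead of external_lock() while the user holds LOCK TABLES */
    trans_register_ha(thd, FALSE, my_se_hton);
    return 0;
  }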

=======================================================================

Understand  JOIN class:
Look at TEST_join() method in sql_test.cc :

CREATE TABLE `t5` (
  `i` int(11) NOT NULL DEFAULT '0',
  `j` int(11) NOT NULL DEFAULT '0',
  `k` int(11) DEFAULT NULL,
  PRIMARY KEY (`i`,`j`),
  KEY `k` (`k`)
);


Example Query:  select * from t5 where k > 0;

Info about JOIN
 t5                type: ALL      q_keys: 2(keys bitmap)  
                   refs: 0 (key_parts)  key: -1  len: 0  // info about keyparts ??
                   quick select used:
                   quick range select, key k, length: 5

  for (i=0 ; i < join->tables ; i++)
  {
    JOIN_TAB *tab=join->join_tab+i;
    TABLE *form=tab->table;
    char key_map_buff[128];
    fprintf(DBUG_FILE,"%-16.16s  type: %-7s  q_keys: %s  refs: %d  key: %d  len: %d\n",
            form->alias,
            join_type_str[tab->type],  // JT_ALL, JT_EQ_REF, etc. access method?
            tab->keys.print(key_map_buff), // used Keys bitmap
            tab->ref.key_parts,
            tab->ref.key,
            tab->ref.key_length);

form->alias  ==> Table's alias. How does it differ ? Include Examples here.
                 For self join ? Temporary table ?

tab->type   ==> type of condition on which it is joined. It could be JT_EQ_REF etc.
                what about composite condition ? 
                   select * from t1, t2 where t1.a = t2.a or  t1.b < t2.b ?
                Is this range condition join or eq_ref join ? 
                      This is range. ref_or_null is a special case of range, isn't it ?
                      (in which range width is 1)
                The outer-join type is not included here. 
                Is it included as a null-row producing predicate rule ?

tab->keys  ==> This is 64-bits keymap which indicates set of all keys that could be
               used in this join. How are the keys numbered ? Check this.

tab->ref.key_parts ==> ref is TABLE_REF which includes info about keys, 
                  but is hardly used ???


if (tab->select)  {// type SQL_SELECT
   print  "select used" ;

    tab->select->quick->dbug_dump(18, FALSE);
    // from quick->dbug_dump() :
    fprintf(DBUG_FILE, "%*squick range select, key %s, length: %d\n",
      indent, "", head->key_info[index].name, max_used_key_length);
   // in table:   KEY   *key_info;  // data of keys in database
   // 
}

Note: Why quick select and non-quick select ??? Unnecessary complications.

Note: st_table (i.e. TABLE) contains Field ** and key information.
      It is very much possible to look up the keys from there.


Example 2:

CREATE TABLE `t3` (
  `id` int(11) NOT NULL AUTO_INCREMENT,
  `j` int(11) DEFAULT NULL,
  PRIMARY KEY (`id`)
)

CREATE TABLE `t4` (
  i int, j int, k int,
  PRIMARY KEY (i),  unique key(j)
) 

CREATE TABLE t5 ( i int, j int, k int,
  PRIMARY KEY (i,j), key(k)
);

mysql> select * from t4, t5 where t5.i = t4.i;

Info about JOIN
 t4                type: ALL      q_keys: 1  refs: 0  key: -1  len: 0
 t5                type: ref      q_keys: 1  refs: 1  key: 0  len: 4
                   select used

mysql> explain extended select * from t4, t5 where t5.i = t4.i;
+----+-------------+-------+------+---------------+---------+---------+-----------+------+----------+-------+
| id | select_type | table | type | possible_keys | key     | key_len | ref       | rows | filtered | Extra |
+----+-------------+-------+------+---------------+---------+---------+-----------+------+----------+-------+
|  1 | SIMPLE      | t4    | ALL  | PRIMARY       | NULL    | NULL    | NULL      |    3 |   100.00 |       |
|  1 | SIMPLE      | t5    | ref  | PRIMARY       | PRIMARY | 4       | test.t4.i |    1 |   100.00 |       |
+----+-------------+-------+------+---------------+---------+---------+-----------+------+----------+-------+

Two tables can be joined with a "dependent" condition or an "independent" condition (e.g. t4.i=3 and t5.i=5).
How do we figure out the dependency here ? See the ref column:  test.t4.i
This means that t5 is nested-loop joined to t4 using this equality condition.
The explain output does not reveal that it is t5.i (a prefix of the primary key) that is being compared against!


mysql> explain extended select t4.i, t4.j from t3, t4, t5  where t5.i = t4.i and t3.id > t4.i;
+----+-------------+-------+------+---------------+---------+---------+-----------+------+----------+--
| id | select_type | table | type | possible_keys | key     | key_len | ref       | rows | filtered | Extra                                |
+----+-------------+-------+------+---------------+---------+---------+-----------+------+----------+-
|  1 | SIMPLE      | t4    | ALL  | PRIMARY       | NULL    | NULL    | NULL      |    3 |   100.00 |
|  1 | SIMPLE      | t5    | ref  | PRIMARY       | PRIMARY | 4       | test.t4.i |    1 |   100.00 | Using index
|  1 | SIMPLE      | t3    | ALL  | PRIMARY       | NULL    | NULL    | NULL      |  516 |   100.00 | Range checked
                                                                               for each record (index map: 0x1) |

Note: The table t3 is filtered by dynamic where condition "t3.id > t4.i".


Example:
mysql> explain extended select * from t3, t4, t5  where t5.i = t4.i and t3.id > t4.i and t3.id > 4 ;
+----+-------------+-------+------+---------------+---------+---------+-----------+------+----------+--
| id | select_type | table | type | possible_keys | key     | key_len | ref       | rows | filtered | Extra
+----+-------------+-------+------+---------------+---------+---------+-----------+------+----------+
|  1 | SIMPLE      | t4    | ALL  | PRIMARY       | NULL    | NULL    | NULL      |    3 |   100.00 |
|  1 | SIMPLE      | t5    | ref  | PRIMARY       | PRIMARY | 4       | test.t4.i |    1 |   100.00 |
|  1 | SIMPLE      | t3    | ALL  | PRIMARY       | NULL    | NULL    | NULL      |  516 |    99.42 | Using where;
                                                                                             Using join buffer |

Note: Does this mean the prev query does not use join buffer ? check this.


mysql> explain extended select t4.i, t4.j from t3, t4, t5  where t5.i = t4.i and  
                      t4.i > (select j from t3 where t3.id = 4) ;
+----+-------------+-------+-------+---------------+---------+---------+-----------+------+----------+-
| id | select_type | table | type  | possible_keys | key     | key_len | ref       | rows | filtered | Extra
+----+-------------+-------+-------+---------------+---------+---------+-----------+------+----------+
|  1 | PRIMARY     | t4    | range | PRIMARY       | PRIMARY | 4       | NULL      |    1 |   100.00 | Using index
                                                                                            condition; Using MRR |
|  1 | PRIMARY     | t5    | ref   | PRIMARY       | PRIMARY | 4       | test.t4.i |    1 |   100.00 | Using index
|  1 | PRIMARY     | t3    | index | NULL          | PRIMARY | 4       | NULL      |  516 |   100.00 | Using index;
                                                                                              Using join buffer 
|  2 | SUBQUERY    | t3    | const | PRIMARY       | PRIMARY | 4       |           |    1 |   100.00 |
+----+-------------+-------+-------+---------------+---------+---------+-----------+------+----------+-

What is the order of execution ? All constant tables are pre-evaluated.
Note: const table is pre-executed and the "select j from t3 where t3.id =4" is replaced by "5".
      The t5 where condition could be appended to include (t5.i > 5) and t3 could be dropped from
      the plan altogether. But that is not the case above.

The TEST_join output for the above query:

t4                type: ALL      q_keys: 1  refs: 0  key: -1  len: 0
                  quick select used:
                  quick range select, key PRIMARY, length: 4
t5                type: ref      q_keys: 1  refs: 1  key: 0  len: 4
                  select used
test.t4.i  
t3                type: index    q_keys: 0  refs: 0  key: -1  len: 0


mysql> explain extended select t4.i, t4.j from t3, t4, t5  where t5.i = t4.i and  
                      t4.i > (select j from t3 where t3.id = t5.i) ;

+----+--------------------+-------+--------+---------------+---------+---------+-----------+------+ ----------+
| id | select_type  | table | type   | possible_keys | key     | key_len | ref       | rows |  filtered | Extra
+----+--------------------+-------+--------+---------------+---------+---------+-----------+------+ ----------+
|  1 | PRIMARY      | t5    | index  | PRIMARY       | PRIMARY | 8       | NULL      |    2 |    100.00 | Using where;
                                                                                                          Using index
|  1 | PRIMARY      | t3    | index  | NULL          | PRIMARY | 4       | NULL      |    3 |    100.00 | Using index;
                                                                                                       Using join buffer
|  1 | PRIMARY      | t4    | eq_ref | PRIMARY       | PRIMARY | 4       | test.t5.i |    1 |    100.00 | 
|  2 |DEPENDENT SUBQ| t3    | eq_ref | PRIMARY       | PRIMARY | 4       | test.t5.i |    1 |    100.00 |
+----+--------------+-------+--------+---------------+---------+---------+-----------+------+ ----------+--

TEST_join output :


Info about JOIN
t5                type: index    q_keys: 1  refs: 0  key: -1  len: 0
                  select used
t3                type: index    q_keys: 0  refs: 0  key: -1  len: 0
t4                type: eq_ref   q_keys: 1  refs: 1  key: 0  len: 4
                  select used    ;                  test.t5.i


Info about JOIN
t3                type: eq_ref   q_keys: 1  refs: 1  key: 0  len: 4
                  select used    ;            test.t5.i

There are 2 joins.



Example: 

mysql> explain  select * from t3, t4, t5 where t3.j = t4.j and t4.i = t5.i ;
+----+-------------+-------+--------+---------------+---------+---------+-----------+------+-------------------+
| id | select_type | table | type   | possible_keys | key     | key_len | ref       | rows | Extra             |
+----+-------------+-------+--------+---------------+---------+---------+-----------+------+-------------------+
|  1 | SIMPLE      | t5    | ALL    | PRIMARY       | NULL    | NULL    | NULL      |    2 |                   | 
|  1 | SIMPLE      | t3    | ALL    | NULL          | NULL    | NULL    | NULL      |    3 | Using join buffer | 
|  1 | SIMPLE      | t4    | eq_ref | PRIMARY,j     | PRIMARY | 4       | test.t5.i |    1 | Using where       | 
+----+-------------+-------+--------+---------------+---------+---------+-----------+------+-------------------+

Info about JOIN
t5                type: ALL      q_keys: 1  refs: 0  key: -1  len: 0  
t3                type: ALL      q_keys: 0  refs: 0  key: -1  len: 0  
t4                type: eq_ref   q_keys: 3  refs: 1  key: 0  len: 4  
                  select used

mysql>   select * from t3, t4, t5 where t3.j = t4.j and t4.i = t5.i ;
+----+------+---+------+------+---+---+------+
| id | j    | i | j    | k    | i | j | k    |
+----+------+---+------+------+---+---+------+
|  1 |    2 | 1 |    2 |    3 | 1 | 2 |    3 | 
|  2 |    3 | 2 |    3 |    4 | 2 | 3 |    4 | 
+----+------+---+------+------+---+---+------+

Recap:
                  join->prepare(...);{
                        setup_tables_and_check_access(); // basic setup
                  }
                  join->flatten_subqueries();
                  join->optimize();
                  join->exec() {
                  do_select()=> sub_select(){
                            join_init_read_record(JOIN_TAB *tab){ ...}
                      }
                  }



mysql -u root test < ...

================================================================

/** mysql_select() explained with example query. Closer look at subselect()  LINE 2900 */

mysql_select(THD *thd, Item ***rref_pointer_array,
         TABLE_LIST *tables, uint wild_num, List<Item> &fields,
         COND *conds, uint og_num,  ORDER *order, ORDER *group,
         Item *having, ORDER *proc_param, ulonglong select_options,
         select_result *result, SELECT_LEX_UNIT *unit, // every subselect has a unit
         SELECT_LEX *select_lex)

    select_lex->join is NULL here; it would be non-NULL if there were a derived table (a select in the from clause).
    new JOIN();
    join->prepare(...){
       much of true JOIN initialization happens here by copying the args for select_lex, conds, etc
       select_lex -> join = this;
       // top-most join list. Why is it represented using a List<TABLE_LIST> instead of a TABLE_LIST chain ?
       join_list = &select_lex->top_join_list; // List<TABLE_LIST> *
       setup_tables_and_check_access() { // basic setup
           Name_resolution_context is select_lex.context; This is used to resolve names.
           TABLE_LIST contains next_local pointers, merge_underlying_list children, next_leaf ptr.
           Fix next_leaf pointers properly.

           TABLE_LIST represents a table in a join. 
           // Note: See sql_select.cc:  TABLE_LIST::print() method !!!
           TABLE_LIST { 
               ...; 
               uint          outer_join;             /* Which join type */
               table_map     dep_tables;             /* tables the table depends on      */   
               table_map     on_expr_dep_tables;     /* tables on expression depends on  */
               struct st_nested_join *nested_join;   /* if the element is a nested join  */
               TABLE_LIST *embedding;             /* nested join containing the table */
               List<TABLE_LIST> *join_list;   /* join list the table belongs to   */
               bool          internal_tmp_table;
               void print(THD *thd, String *str, enum_query_type query_type); //
               ...
           }
         For this query: select * from t3, t4, t5 where t3.j = t4.j and t4.i = t5.i ;
           join->tables_list = TABLE_LIST node of "t3" table only.
           t3-TABLE_LIST -> join_list = List<TABLE_LIST>* = [t3, t4, t5] list;
           select_lex->top_join_list = List<TABLE_LIST> = [t3, t4, t5] list;
         The  "t3, t4, t5" from clause tables considered to be top level unnested tables.
           t3-TABLE_LIST -> embedding = NULL;
         TABLE_LIST nodes are connected through next_leaf pointers for leaf nodes.
       }
Note: Item **ref_pointer_array is a significant concept. It contains all references
     from the select, order by and group by fields. It is initially set up during parsing and
     fixed up later during semantic checking.
     fields_list = List<Item> of select-list items
          setup_wild() { .. select_lex->item_list = fields; after replacing *, etc. ; }
       st_select_lex::fix_prepare_information() => saves where and having conditions 
                               in prep_where and prep_having variables for reuse.

    } // end of join->prepare()
    join->flatten_subqueries();
    join->optimize(){
       build_bitmap_for_nested_joins(); ==> is it required? why?
       ...
       make_join_statistics(); // JOIN_TABs are set here.

    }
    
    join->exec(){

       curr_join->examined_rows = 0;

       thd_proc_info(thd, "Sending data");  
       result->send_fields(...);
       do_select() {  // operates on "join"
          // assuming t3, t5, t4 order ...
          join_tab[2].next_select = end_select; // = end_send();
          // Note: end_send() invoked at the end of each nested loop iteration.
          // start processing from non-constant tables ...
          join_tab=join->join_tab+join->const_tables;
          If all tables are constant tables{
            call end_select() directly.
            join->examined_rows++;  join->thd->row_count++;
          }

          // sub_select() operates on join_tab
          rc = sub_select(JOIN *join,JOIN_TAB *join_tab,bool end_of_records = 0) {

              first join_tab is for table "t3";
              READ_RECORD *info= &join_tab->read_record;
              join->return_tab = join_tab;  // i.e. "t3"
              join->thd->row_count = 0;
              (*join_tab->read_first_record)(join_tab); { // i.e. join_init_read_record()
                 init_read_record(READ_RECORD *info, ...)  { 
                    // Fillup info structure READ_RECORD *info
                    ...
                    info->table = table;
                    info->file  = table->file; // Handler instance
                    info->read_record =  rr_sequential;
                    table->file->ha_rnd_init(bool scan = 1); // this calls rnd_init(1);
                    table->file->extra_opt(HA_EXTRA_CACHE, readbufsize=128K);
                    // cond_push retried here if it had not been pushed already earlier.
                    table->file->cond_push(select->cond); // unnecessary ????
                 }
                 read record using proper function. ie. rr_sequential(READ_RECORD *info) {
                   // Read only one row until successful.
                   do {
                      int tmp = info->file->rnd_next(info->record);
                   } while (tmp != 0);
                 }
              } // end of read_first_record i.e. join_init_read_record() 

              rc= evaluate_join_record(join, join_tab, error) {
                 //  is this outer join related ? no.
                 join_tab -> found_match = TRUE;
                 // is this semi join ? no.

                 // Call next_select() function. i.e. sub_select_cache() on
                 // next table. Remember, it is t3, t5, t4.
                 rc= (*join_tab->next_select)(join, join_tab+1, end_of_records = 0) {
                 // This is sub_select_cache(JOIN *join, JOIN_TAB *join_tab, end_of_recs)
                    store_record_in_cache((JOIN_CACHE *)&join_tab->cache);
                    // This stores partial record read from t3 to t5's join_tab cache!!!?
                 }
              }


              while (rc == NESTED_LOOP_OK && join->return_tab >= join_tab)
              {
                   error = info->read_record(info);
                   rc = evaluate_join_record(join, join_tab, error);
              }
              if (rc == no more rows && outer join recognized with no matching rows){
                   // i.e.  join_tab->last_inner && !join_tab->found
                   rc = evaluate_null_complemented_join_record(join, join_tab);
              }

              // rc is NESTED_LOOP_NO_MORE_ROWS. Time to exit from sub_select();
              return from sub_select() with rc = NESTED_LOOP_OK;

              // Step 1: All t3's rows are read and put in cache of t5 (next join_tab)
              // 

          } // End of sub_select() with end_of_records = false

          // Again call sub_select() this time indicating end of records reached:
          sub_select(JOIN *join,JOIN_TAB *join_tab,bool end_of_records = 1) {


              rc= (*join_tab->next_select)(join, join_tab+1, end_of_records = 1){
                   // i.e.  This is sub_select_cache() invoked with end of record=true

                flush_cached_records(join, join_tab, skip_last=FALSE){

                   join_init_read_record(join_tab); // now second table (t5) first row read

                   For first record and then for each record of table t5: {

                     reset_cache_read(&join_tab->cache); // read cache pointer set to start.

                     For each cache entry in join_tab (for table t5) {
                      read_cached_record(join_tab); // 
                      // Now join_tab(t5) (for second table) contains first table(t3)'s rows

                      // Now call sub_select() with end_of_records = false on 3rd table (t4)
                      rc= (join_tab->next_select)(join,join_tab+1,0){

                         error= (*join_tab->read_first_record)(join_tab){
                            // This is join_read_key() on table t4 
                            // eq_ref access method "read_first" function.
                            // It has one element cache (as against "ref" method)
                            table->file->ha_index_init(tab->ref.key, tab->sorted);
                            cmp_buffer_with_ref(join_tab){
                               create key and check if the record has already been read
                               for the given key;
                            }
                            // Read the record (eq_ref) into record[0] if needed:
                            table->file->index_read_map(table->record[0], tab->ref.key_buff,
                                  make_prev_keypart_map(tab->ref.key_parts), HA_READ_KEY_EXACT);
                         }

                         rc= evaluate_join_record(join, join_tab, error){ // on last table t4
                            COND *select_cond = join_tab->select_cond ; 
                            // t4.j = t3.j is the condition.  
                            // t4.i=t5.i already taken care of by eq_ref access method
                            check the condition by: select_cond->val_int() 
                            // First row of t5 : condition not satisfied.
                            // Second row of t5 : condition satisfied ?
                            if (satisfied condition){
                                // The final next_select points to end_send()
                                (*join_tab->next_select)(join_tab of t4+1, end=0) { // end_send()
                                   join->result->send_data(){
                                      // Result field points to table->record[0]
                                      // Item-field->val_str() returns string representation
                                   }
                                }
                            }
                         }

   
                         // For last table t4, accessing rest of table is not required. 
                         // eq_ref has accessed the one and only row already.
                         // info->read_record is set to join_no_more_records()
                         // to disable reading rest of table t4.
                         while (rc == NESTED_LOOP_OK && join->return_tab >= join_tab) {
                             // dummy loop
                             error= info->read_record(info); // join_no_more_records();
                             rc= evaluate_join_record(join, join_tab, error); // return 
                         }

                      } // end of sub_select() with endofrecords=false on 3rd table (t4)
                    } // For each cache entry
                   } // For every record of this table
                 } // end of flush_cached_records()
              } // end of sub_select_cache() with end of records = 1
          } // End of sub_select() with end of records = true

        join->join_free() // Free all join resources such as join buffers, etc.
          For each join tab join_tab->cleanup():
            table->file->ha_index_or_rnd_end();
            table->reginfo.join_tab = 0;  // Disassociate table from join_tab
          mysql_unlock_read_tables(thd, lock);           // Don't free join->lock
            unlock_external(thd,table,3);    // All 3 external tables are unlocked
               For table in t3,t4,t5  table->file->ha_external_lock(thd, F_UNLCK); 
          select_send::send_eof()
            ha_release_temporary_latches(thd):
              call release_temporary_latches() for each plugin:
              i.e. hton->release_temporary_latches(hton, thd);
            mysql_unlock_tables(thd, thd->lock);           // 
              [To free other (write) tables. In this case all tables unlocked already]
       } // End of do_select()
    } // End of join->exec()


TableName            t3                        t5                      t4

join_tab->         
read_first_record    join_init_read_record()   join_init_read_record() join_read_key

info->read_record    rr_sequential()           rr_sequential()         join_no_more_records

join_tab->        
next_select          sub_select_cache()        sub_select()            end_send()


Note:  make_join_readinfo() sets up the join_tab next_select etc links.


SequenceOfCalls:

  joining order:  t3, t5, t4

do_select :

  sub_select(t3, end=0)
    For each record, read  (rr_sequential()) and call  evaluate_join_record(t3) {
         sub_select_cache(t5, end=0) {  // because it is next_select ptr of t3
             store t3's records in t5's join_tab->cache 
         }
    }

  sub_select(t3, end=1)
    sub_select_cache(t5, end=1)  // because it is next_select ptr of t3
       flush_cached_records(t5)
          For every record of t5 do:
             Read each cache-entry  (in t5's cache)
                sub_select(t4, end=0); // t5's next_select pointer is sub_select
                   join_read_key(); // eq_ref access method. This is read_first_record ptr.
                      table->file->ha_index_init(tab->ref.key, tab->sorted); // specify index
                      table->file->index_read_map(table->record[0],...);
                   evaluate_join_record(t4);
                      Check for t4.j = t3.j condition. Return row if satisfied.
       sub_select(t5, end=1)
          sub_select(t4, end=1)  // because it is next_select ptr of t5
             end_send(end=1)  // because it is next_select ptr of t4
       
//do_select() end.
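
A standalone toy (not server code) of the join-buffer flow traced above:
t3 is scanned and its rows only stored in the next table's cache
(sub_select_cache); at end-of-records the cache is flushed, i.e. for every row
of t5 the cached t3 rows are replayed, and each surviving combination probes
the last table t4 by primary key (the eq_ref step) before the row is sent.
The data is modeled on the small t3/t4/t5 example earlier.

  #include <cstdio>
  #include <vector>
  #include <map>

  struct row3 { int id, j; };
  struct row5 { int i, j, k; };
  struct row4 { int i, j, k; };

  int main()
  {
    std::vector<row3> t3;    // scanned first (ALL)
    std::vector<row5> t5;    // scanned second (ALL); owns the join cache for t3
    std::map<int,row4> t4;   // probed last by primary key i (eq_ref)

    row3 a3[]= { {1,2}, {2,3}, {3,9} };
    row5 a5[]= { {1,2,3}, {2,3,4} };
    row4 a4[]= { {1,2,3}, {2,3,4} };
    t3.assign(a3, a3+3);
    t5.assign(a5, a5+2);
    for (int n= 0; n < 2; n++) t4[a4[n].i]= a4[n];

    /* Phase 1: sub_select(t3) + sub_select_cache(t5): fill the join cache */
    std::vector<row3> t5_cache;
    for (size_t r= 0; r < t3.size(); r++)
      t5_cache.push_back(t3[r]);                 // store_record_in_cache()

    /* Phase 2: flush_cached_records(): scan t5, replay cache, probe t4 */
    for (size_t r5= 0; r5 < t5.size(); r5++)
    {
      for (size_t c= 0; c < t5_cache.size(); c++)        // read_cached_record()
      {
        std::map<int,row4>::iterator it= t4.find(t5[r5].i);  // join_read_key()
        if (it == t4.end())
          continue;                              // no eq_ref match for t4.i=t5.i
        /* evaluate_join_record(): remaining condition t4.j = t3.j */
        if (it->second.j == t5_cache[c].j)
          printf("row: t3.id=%d t5.i=%d t4.i=%d\n",        // end_send()
                 t5_cache[c].id, t5[r5].i, it->second.i);
      }
    }
    return 0;
  }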

Debug Tip:  print *thd; cut and paste to  /tmp/thd ; Use the pointers for reference.

digraph G {

// size="7,8!";
 center=true;
 ranksep=0.8;
 nodesep=0.8;
 style=bold;
 edge  [fontsize=20,style=bold,color=green, fontcolor=blue];
 node [shape=record,fontsize=20,color=red, fontcolor=blue];


}


Also understand:
mysql>  explain select t3.* from t3, t4, t5 where t3.j = t4.j and t4.i = t5.i group by j ;
+----+-------------+-------+--------+---------------+---------+---------+-----------+------+----------------------------------------------+
| id | select_type | table | type   | possible_keys | key     | key_len | ref       | rows | Extra                                        |
+----+-------------+-------+--------+---------------+---------+---------+-----------+------+----------------------------------------------+
|  1 | SIMPLE      | t5    | index  | PRIMARY       | PRIMARY | 8       | NULL      |    2 | Using index; Using temporary; Using filesort | 
|  1 | SIMPLE      | t3    | ALL    | NULL          | NULL    | NULL    | NULL      |    3 | Using join buffer                            | 
|  1 | SIMPLE      | t4    | eq_ref | PRIMARY,j     | PRIMARY | 4       | test.t5.i |    1 | Using where                                  | 
+----+-------------+-------+--------+---------------+---------+---------+-----------+------+----------------------------------------------+


Note: Between optimization and execution, are the tables locked ?  Yes.


Goals :

Create an FAQ for the list of things that you would like to access.
Provide access functions for that :

Purpose:  Given table t, return the list of keys :
Solution:  1) process t->field  (of type Field **) and return a subset of keys?
           2) just return t->key_info ? 
           Which is better ?
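
A sketch for option 2 above: walk TABLE::key_info directly rather than
deriving anything from the Field array. The member names are the 5.1-era ones
(TABLE_SHARE::keys, KEY::name, KEY::key_parts, KEY_PART_INFO::field); verify
them against your tree.

  #include "mysql_priv.h"

  static void print_table_keys(TABLE *table)
  {
    for (uint k= 0; k < table->s->keys; k++)
    {
      KEY *key= table->key_info + k;
      fprintf(DBUG_FILE, "key %u: %s (", k, key->name);
      for (uint p= 0; p < key->key_parts; p++)
        fprintf(DBUG_FILE, "%s%s", p ? "," : "",
                key->key_part[p].field->field_name);
      fprintf(DBUG_FILE, ")\n");
    }
  }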

Purpose:  Given a query, display the join table hierarchy in tree format.
Solution: You get a JOIN with JOIN_TAB structures. Explore how to traverse and
          print this info. Study TEST_join() 

/**  MySQL Join syntax:  LINE 3225 */

MySQL Join syntax:

   select * from t1 join t2 ON t1.a = t2.a ;  ==> Inner join
   select * from t1 left join t2 ON t1.a = t2.a ; ==> Outer join (OUTER keyword is optional)
   SELECT * FROM t1 NATURAL RIGHT JOIN t2; ==> similar to left join.
                                               Better to use left join for portability.
   select * from t1 natural join t2 ;  join on equality of all common columns
   select * from t1 join t2 using (a);  Inner join with t1.a = t2.a
   select * from t1 natural left [outer] join t2;
   select * from t1 cross join t2 on (t1.a = t2.a); In MySQL, inner=cross, so this is OK.
                                   ; In standard SQL, ON is not allowed with CROSS JOIN
   SELECT * FROM t1 LEFT JOIN (t2, t3, t4)
                 ON (t2.a=t1.a AND t3.b=t1.b AND t4.c=t1.c)

   select * from  t1 straight_join t2 where t1.a = t2.a ; // semantics is the same as inner join.
                     // optimizer hint: read t1 before t2

   The following 2 queries yield the same rows but the number of output columns differs:
   select * from t1 LEFT JOIN t2 ON  t1.c1 = t2.c1 and t1.c2 = t2.c2 ; // 2+2 columns output
   select * from t1 LEFT JOIN t2 USING (c1,c2);  // Only 2 columns in output

   
   SELECT * FROM t1 NATURAL JOIN t2 NATURAL JOIN t3; // Earlier this was interpreted as
             // (t1 natural join (t2 natural join t3)) i.e. common column requirement with 
             // t1 and t3 was ignored. Now right interpretation is Natural Join(t1,t2,t3)
             // where all common columns should match.

  Note: Full outer join not supported by mysql.


Note: The precedence of the "comma" operator is lower than that of JOIN in the FROM clause.


Table in from clause:
   SELECT * FROM (SELECT 1, 2, 3) AS t1;
   
Index Hints:
  SELECT * FROM table1 USE INDEX (col1_index,col2_index)
            WHERE col1=1 AND col2=2 AND col3=3;

  SELECT * FROM t1 USE INDEX (i1) IGNORE INDEX FOR ORDER BY (i2) ORDER BY a;

  SELECT table1.*,table2.* FROM table1 USE INDEX (col2_index), table2 
            WHERE table1.col1=table2.col1 AND table1.col2=2 AND table1.col3=3;

MySQL specific hints:
   SELECT /*! STRAIGHT_JOIN */ col1 FROM table1,table2 WHERE ...

DEFN: SELECT syntax
SELECT
    [ALL | DISTINCT | DISTINCTROW ]
      [HIGH_PRIORITY]
      [STRAIGHT_JOIN]
      [SQL_SMALL_RESULT] [SQL_BIG_RESULT] [SQL_BUFFER_RESULT]
      [SQL_CACHE | SQL_NO_CACHE] [SQL_CALC_FOUND_ROWS]
    select_expr, ...
    [FROM table_references
    [WHERE where_condition]
    [GROUP BY {col_name | expr | position}
      [ASC | DESC], ... [WITH ROLLUP]]
    [HAVING where_condition]
    [ORDER BY {col_name | expr | position}
      [ASC | DESC], ...]
    [LIMIT {[offset,] row_count | row_count OFFSET offset}]
    [PROCEDURE procedure_name(argument_list)]
    [INTO OUTFILE 'file_name' export_options
      | INTO DUMPFILE 'file_name'
      | INTO var_name [, var_name]]
    [FOR UPDATE | LOCK IN SHARE MODE]]


=======================================================================

/** Postgres Vs MySQL Comparison  LINE 3300 */


Notes on postgres compared to mysql:

- postgres is in C, not C++
- The code is mostly very modular
- As a comparison, look at this from postgres :

"src/include/nodes/parsenodes.h"
typedef struct SelectStmt
{
        NodeTag         type;

        /*
         * These fields are used only in "leaf" SelectStmts.
         */
        List       *distinctClause; /* NULL, list of DISTINCT ON exprs, or
                                                                 * lcons(NIL,NIL) for all (SELECT DISTINCT) */
        IntoClause *intoClause;         /* target for SELECT INTO / CREATE TABLE AS */
        List       *targetList;         /* the target list (of ResTarget) */
        List       *fromClause;         /* the FROM clause */
        Node       *whereClause;        /* WHERE qualification */
        List       *groupClause;        /* GROUP BY clauses */
        Node       *havingClause;       /* HAVING conditional-expression */

        /*
         * In a "leaf" node representing a VALUES list, the above fields are all
         * null, and instead this field is set.  Note that the elements of the
         * sublists are just expressions, without ResTarget decoration. Also note
         * that a list element can be DEFAULT (represented as a SetToDefault
         * node), regardless of the context of the VALUES list. It's up to parse
         * analysis to reject that where not valid.
         */
        List       *valuesLists;        /* untransformed list of expression lists */

        /*
         * These fields are used in both "leaf" SelectStmts and upper-level
         * SelectStmts.
         */
        List       *sortClause;         /* sort clause (a list of SortBy's) */
        Node       *limitOffset;        /* # of result tuples to skip */
        Node       *limitCount;         /* # of result tuples to return */
        List       *lockingClause;      /* FOR UPDATE (list of LockingClause's) */

        /*
         * These fields are used only in upper-level SelectStmts.
         */
        SetOperation op;                        /* type of set op  */
        bool            all;                    /* ALL specified? */
        struct SelectStmt *larg;        /* left child */
        struct SelectStmt *rarg;        /* right child */
        /* Eventually add fields for CORRESPONDING spec here */
} SelectStmt;


=======================================================================


Example:

   student : (studentid : integer, name: varchar(10))
   course  : (courseid: integer, subject : varchar(20))
   enrol   : (studentid: int, courseid:int, primarykey(studentid, courseid), year: int)
   

CREATE TABLE `t1` (
  `id` int(11) NOT NULL AUTO_INCREMENT,
  `nonkey` int(11) DEFAULT NULL,
  PRIMARY KEY (`id`)
);

create table t2 like t1;

Simple Range Example :

drop table if exists t1;
drop table if exists t2;

-- create table t1 (pkey integer primary key, uniq integer unique, akey integer key, nonkey integer);
create table t1 (pkey integer primary key, nonkey integer);
create table t2 like t1;

insert into t1 values (1, 2), (2, 3), (3, 4), (4,5);
insert into t2 select * from t1;

insert into t1(pkey,nonkey) select 4+pkey, 4+nonkey from t1;
insert into t1(pkey,nonkey) select 8+pkey, 8+nonkey from t1;
insert into t1(pkey,nonkey) select 16+pkey, 16+nonkey from t1;
insert into t1(pkey,nonkey) select 32+pkey, 32+nonkey from t1;
insert into t1(pkey,nonkey) select 64+pkey, 64+nonkey from t1;
insert into t1(pkey,nonkey) select 128+pkey, 128+nonkey from t1;
insert into t1(pkey,nonkey) select 256+pkey, 256+nonkey from t1;
insert into t1(pkey,nonkey) select 512+pkey, 512+nonkey from t1;

select  * from t1 where pkey > 0 and pkey < 10;

explain   select  * from t1 where t1.pkey in (select pkey from t2 where t2.pkey < t1.pkey+dayofmonth(now()));
explain   select  * from t1, t2 where t1.pkey = t2.nonkey and t1.nonkey in (select pkey from t2 where nonkey=t1.nonkey);
   select  * from t1 where t1.nonkey in (select id from t2);



Tracing Range Query :    /** LINE 3410 */

Useful Links : 
   
    http://forge.mysql.com/wiki/MySQL_Internals_Custom_Engine
    https://inside.mysql.com/wiki/DocSE


  select * from t1 where pkey > 0 and pkey < 10;


                join->prepare(...);{
                   setup_tables_and_check_access(); // basic setup
                }
                join->flatten_subqueries();
                join->optimize();
                join->exec() {
                    do_select()=> sub_select(){
                       join_init_read_record(JOIN_TAB *tab){
                          records.cc:init_read_record() {
                              info->read_record = rr_sequential;
                              table->file->ha_rnd_init(1);
                              // advise the SE to cache the scan results
                              table->file->extra_opt(HA_EXTRA_CACHE,
                                  thd->variables.read_buff_size);
                              table->file->cond_push(select->cond);
                              call rr_sequential()

                              evaluate_join_record(join, join_tab, error){
                                  join->result->send_data(*join->fields);
                              }
                          }
                       }
                    }
                }
             }
         }


  select * from t2 where nonkey in (select idx from t2);

(gdb) bt
#0  setup_table_map (table=0xa52b958, table_list=0xa524698, tablenr=0) at mysql_priv.h:2370
#1  0x0824c158 in setup_tables (thd=0xa4d6bf0, context=0xa4d7fb0, from_clause=0xa4d8040, tables=0xa524698, leaves=0xb6ca6d50, select_insert=false) at sql_base.cc:7515
#2  0x0824c3f4 in setup_tables_and_check_access (thd=0xa4d6bf0, context=0xa4d7fb0, from_clause=0xa4d8040, tables=0xa524698, leaves=0xa4d8054, select_insert=false, want_access_first=1, want_access=1) at sql_base.cc:7589
#3  0x082898d8 in JOIN::prepare (this=0xa525110, rref_pointer_array=0xa4d8074, tables_init=0xa524698, wild_num=1, conds_init=0xa524fc0, og_num=0, order_init=0x0, group_init=0x0, having_init=0x0, proc_param_init=0x0, select_lex_arg=0xa4d7f7c, unit_arg=0xa4d7d0c) at sql_select.cc:483
#4  0x0828a7c9 in mysql_select (thd=0xa4d6bf0, rref_pointer_array=0xa4d8074, tables=0xa524698, wild_num=1, fields=@0xa4d8010, conds=0xa524fc0, og_num=0, order=0x0, group=0x0, having=0x0, proc_param=0x0, select_options=2147764736, result=0xa5250f8, unit=0xa4d7d0c, select_lex=0xa4d7f7c) at sql_select.cc:2349
#5  0x0828fbfb in handle_select (thd=0xa4d6bf0, lex=0xa4d7cb0, result=0xa5250f8, setup_tables_done_option=0) at sql_select.cc:269
#6  0x082029d1 in execute_sqlcom_select (thd=0xa4d6bf0, all_tables=0xa524698) at sql_parse.cc:4890
#7  0x08204362 in mysql_execute_command (thd=0xa4d6bf0) at sql_parse.cc:2184
#8  0x0820d056 in mysql_parse (thd=0xa4d6bf0, inBuf=0xa524520 "select * from t2 where nonkey in (select idx from t2)", length=53, found_semicolon=0xb6ca826c) at sql_parse.cc:5789
#9  0x0820dc49 in dispatch_command (command=COM_QUERY, thd=0xa4d6bf0, packet=0xa51c4c1 "select * from t2 where nonkey in (select idx from t2)", packet_length=53) at sql_parse.cc:1200
#10 0x0820ee0f in do_command (thd=0xa4d6bf0) at sql_parse.cc:857
#11 0x081fbbd1 in handle_one_connection (arg=0xa4d6bf0) at sql_connect.cc:1115
#12 0x0081443b in start_thread () from /lib/libpthread.so.0
#13 0x0076bfde in clone () from /lib/libc.so.6

  The TABLE_LIST *tables; contains 2 instances of t2.
    First instance's t2->next_local = null;
    setup_tables() fixes top level t2 only !?

  The first instance of table t2 has table->map =  1 << 0 ; 

The second instance is processed later, when the fields (i.e. the subquery) are processed.

JOIN::prepare(){
   counts total number of tables as 1 !;
   ....
   Note: The recursive JOIN::prepare() !!!
   The second JOIN::prepare() tables_init contains only one instance of t2!

   If this is a subquery, i.e. select_lex->master_unit()->item != null {
       subselect->select_transformer(this) {
           left_expr->fix_fields(); // {
              // Resolve name of column reference ....
             // The method resolves the column reference represented by 'this' as a column
             // present in one of: FROM clause, SELECT clause, GROUP BY clause of a query
             // Q, or in outer queries that contain Q.
           }
       }
   }
  
}

Breakpoint 7, JOIN::prepare (this=0xa52cf08, rref_pointer_array=0xa524a10, tables_init=0xa524dd8, wild_num=0, conds_init=0x0, og_num=0, order_init=0x0, group_init=0x0, having_init=0x0, proc_param_init=0x0, select_lex_arg=0xa524918, unit_arg=0xa524aa0) at sql_select.cc:456
#0  JOIN::prepare (this=0xa52cf08, rref_pointer_array=0xa524a10, tables_init=0xa524dd8, wild_num=0, conds_init=0x0, og_num=0, order_init=0x0, group_init=0x0, having_init=0x0, proc_param_init=0x0, select_lex_arg=0xa524918, unit_arg=0xa524aa0) at sql_select.cc:456
#1  0x081a66ca in subselect_single_select_engine::prepare (this=0xa525070) at item_subselect.cc:1756
#2  0x081aaba2 in Item_subselect::fix_fields (this=0xa524fc0, thd_param=0xa4d6bf0, ref=0xa5262a4) at item_subselect.cc:158
#3  0x081aae91 in Item_in_subselect::fix_fields (this=0xa524fc0, thd_arg=0xa4d6bf0, ref=0xa5262a4) at item_subselect.cc:1624
#4  0x08249e36 in setup_conds (thd=0xa4d6bf0, tables=0xa524698, leaves=0xa524698, conds=0xa5262a4) at sql_base.cc:7937
#5  0x082919c1 in setup_without_group (thd=0xa4d6bf0, ref_pointer_array=0xa526448, tables=0xa524698, leaves=0xa524698, fields=@0xa4d8010, all_fields=@0xa526230, conds=0xa5262a4, order=0x0, group=0x0, hidden_group_fields=0xa526213) at sql_select.cc:413
#6  0x08289abf in JOIN::prepare (this=0xa525110, rref_pointer_array=0xa4d8074, tables_init=0xa524698, wild_num=1, conds_init=0xa524fc0, og_num=0, order_init=0x0, group_init=0x0, having_init=0x0, proc_param_init=0x0, select_lex_arg=0xa4d7f7c, unit_arg=0xa4d7d0c) at sql_select.cc:495
#7  0x0828a7c9 in mysql_select (thd=0xa4d6bf0, rref_pointer_array=0xa4d8074, tables=0xa524698, wild_num=1, fields=@0xa4d8010, conds=0xa524fc0, og_num=0, order=0x0, group=0x0, having=0x0, proc_param=0x0, select_options=2147764736, result=0xa5250f8, unit=0xa4d7d0c, select_lex=0xa4d7f7c) at sql_select.cc:2349
#8  0x0828fbfb in handle_select (thd=0xa4d6bf0, lex=0xa4d7cb0, result=0xa5250f8, setup_tables_done_option=0) at sql_select.cc:269
#9  0x082029d1 in execute_sqlcom_select (thd=0xa4d6bf0, all_tables=0xa524698) at sql_parse.cc:4890
#10 0x08204362 in mysql_execute_command (thd=0xa4d6bf0) at sql_parse.cc:2184
#11 0x0820d056 in mysql_parse (thd=0xa4d6bf0, inBuf=0xa524520 "select * from t2 where nonkey in (select idx from t2)", length=53, found_semicolon=0xb6ca826c) at sql_parse.cc:5789
#12 0x0820dc49 in dispatch_command (command=COM_QUERY, thd=0xa4d6bf0, packet=0xa51c4c1 "select * from t2 where nonkey in (select idx from t2)", packet_length=53) at sql_parse.cc:1200
#13 0x0820ee0f in do_command (thd=0xa4d6bf0) at sql_parse.cc:857
#14 0x081fbbd1 in handle_one_connection (arg=0xa4d6bf0) at sql_connect.cc:1115
#15 0x0081443b in start_thread () from /lib/libpthread.so.0
#16 0x0076bfde in clone () from /lib/libc.so.6

Table List: ptr=a524698
Table: db=test table_name=t2 tableptr=0xa52b958
Table: db=test table_name=t2 tableptr=0xa523480
End of Table List

Table List: ptr=a524dd8
Table: db=test table_name=t2 tableptr=0xa523480
End of Table List


First select lex:   select_lex_arg=0xa4d7f7c
Second select lex:  select_lex_arg=0xa524918


For a subquery that is part of an expression,
the following holds for its select_lex :

  Item_subselect*  item =  select_lex->master_unit()->item  // not null

Note:  SELECT_LEX_UNIT can contain (1 or more) SELECT_LEX (but not other units)
       SELECT_LEX can contain (0 or more)  SELECT_LEX_UNIT and belongs to 1 master unit.

The top SELECT_LEX_UNIT and the first SELECT_LEX are stored in LEX.
The LEX::current_select points to current processed select lex.

   print_select_lex() should print this hierarchy in a graph!

Note:   SELECT_LEX_UNIT contains more than 1 SELECT_LEX implies they are UNIONs.
        SELECT_LEX can contain (ie. refer to) any number of SELECT_LEX_UNIT:
            - Result field. e.g. select (select pkey from t1 where pkey=const), 3, nonkey ...
                        note: the subquery must return only one column and at most one row.
            - Derived table (subquery in the FROM clause; can be a single select or a union)
            - Item in a WHERE/ON expression. e.g. val in (select pkey from t1) ;
                         note: the subquery must return a single column only.

After a bit of optimization, this query looks like this:

Select Lex ptr=0xa4d7f7c
select `test`.`t2`.`idx` AS `idx`,`test`.`t2`.`nonkey` AS `nonkey` from `test`.`t2` where (`test`.`t2`.`nonkey`,(select 1 AS `Not_used` from `test`.`t2` where ((`test`.`t2`.`nonkey`) = `test`.`t2`.`idx`)))


When the first row is read, the JOIN::optimize is called for the subquery:

(gdb) bt
#0  JOIN::optimize (this=0xa52cf08) at sql_select.cc:766
#1  0x081a5b6f in subselect_single_select_engine::exec (this=0xa525070) at item_subselect.cc:1880
#2  0x081aaab7 in Item_subselect::exec (this=0xa524fc0) at item_subselect.cc:263
#3  0x081a4382 in Item_in_subselect::val_bool (this=0xa524fc0) at item_subselect.cc:857
#4  0x0813af6a in Item::val_bool_result (this=0xa524fc0) at item.h:731
#5  0x0816bee0 in Item_in_optimizer::val_int (this=0xa52e128) at item_cmpfunc.cc:1607
#6  0x0826d643 in evaluate_join_record (join=0xa525110, join_tab=0xa52e760, error=0) at sql_select.cc:11122
#7  0x0826d9d4 in sub_select (join=0xa525110, join_tab=0xa52e760, end_of_records=false) at sql_select.cc:11077
#8  0x08275e77 in do_select (join=0xa525110, fields=0xa4d8010, table=0x0, procedure=0x0) at sql_select.cc:10833
#9  0x0828f8c7 in JOIN::exec (this=0xa525110) at sql_select.cc:2192
#10 0x0828a872 in mysql_select (thd=0xa4d6bf0, rref_pointer_array=0xa4d8074, tables=0xa524698, wild_num=1, fields=@0xa4d8010, conds=0xa524fc0, og_num=0, order=0x0, group=0x0, having=0x0, proc_param=0x0, select_options=2147764736, result=0xa5250f8, unit=0xa4d7d0c, select_lex=0xa4d7f7c) at sql_select.cc:2371
#11 0x0828fbfb in handle_select (thd=0xa4d6bf0, lex=0xa4d7cb0, result=0xa5250f8, setup_tables_done_option=0) at sql_select.cc:269
#12 0x082029d1 in execute_sqlcom_select (thd=0xa4d6bf0, all_tables=0xa524698) at sql_parse.cc:4890
#13 0x08204362 in mysql_execute_command (thd=0xa4d6bf0) at sql_parse.cc:2184
#14 0x0820d056 in mysql_parse (thd=0xa4d6bf0, inBuf=0xa524520 "select * from t2 where nonkey in (select idx from t2)", length=53, found_semicolon=0xb6ca826c) at sql_parse.cc:5789
.....


Note:
  select * from t2 where nonkey in (select idx from t2);
The above query creates a single JOIN_TAB for the main query. During execution of this join_tab,
the subquery gets executed when the WHERE condition is evaluated.

The number of join_tab's = Number of from tables



Other subqueries to experiment:

   select distinct * from  t1 as first join t1 as second ON (first.key = second.nonkey) ;
   SELECT * FROM t1 JOIN (t2, t3 ) ON (t1.a = t2.b) ; 

What does the table list for the above query look like ?
                   t1 --> on_expr --> nil
             on_expr ==> t2 --> t3;  t2-->embedding = on_expr


Let us analyze this query :
   select * from  (select * from t2 group by idx) as mytab where mytab.idx > 3;

Note: Derived tables are computed first from open_and_lock_tables_derived()

JOIN::exec stack trace variables (for derived table) :
    table_list = (t2)
    this -> tables == 1
    this->join_tab->table = "table t2"
    join_tab->type = JT_NEXT
    read_first_record = 0x826c324 ,
    simple_order = simple_group = true
    send_group_parts = 1
    this -> group = true

After this exec is done, the main query's optimize() etc. are run.
The temporary table uses the ha_heap (MEMORY) engine.

   SELECT * FROM t1 JOIN (t2, t3 ) ON (t1.a = t2.b) ; 

mysql> explain     SELECT * FROM t1 JOIN (t2 JOIN t3 ON (t2.nonkey != t3.nonkey) ) 
         ON (t1.nonkey = t2.nonkey) and (t3.nonkey = 3) and t2.nonkey < 3  ;

+----+-------------+-------+------+---------------+------+---------+------+------+--------------------------------+
| id | select_type | table | type | possible_keys | key  | key_len | ref  | rows | Extra                          |
+----+-------------+-------+------+---------------+------+---------+------+------+--------------------------------+
|  1 | SIMPLE      | t1    | ALL  | NULL          | NULL | NULL    | NULL |    2 | Using where                    |
|  1 | SIMPLE      | t2    | ALL  | NULL          | NULL | NULL    | NULL |    3 | Using where; Using join buffer |
|  1 | SIMPLE      | t3    | ALL  | NULL          | NULL | NULL    | NULL |    3 | Using where; Using join buffer |
+----+-------------+-------+------+---------------+------+---------+------+------+--------------------------------+
3 rows in set (0.01 sec)

 SELECT * FROM t1 JOIN (t2 JOIN t3 ON (t2.nonkey!=t3.nonkey) ) ON (t1.nonkey = t2.nonkey) and (t3.nonkey = 3) and t2.nonkey < 4;


where = 0; // where condition is null!
select_lex->top_join_list is List<TABLE_LIST> * ;
It's first TABLE_LIST * contains "nested join" info and it is not a leaf table_list node :

    print *(TABLE_LIST*)0xa594ae0
    print ((TABLE_LIST*)0xa594ae0) -> nested_join
      (st_nested_join *) 0xa594cb8

    print  * ( (st_nested_join *) 0xa594cb8 )
 

$14 = {
  next_local = 0x0, 
  next_global = 0x0, 
  prev_global = 0x0, 
  db = 0x0, 
  alias = 0x84f3da0 "(nest_last_join)", 
  table_name = 0x0, 
...
  table = 0x0, 
  table_id = 0, 
  derived = 0x0, 
  select_lex = 0x0, 
  where = 0x0, 
  nested_join = 0xa594cb8,   // top nested join  nested_join
  embedding = 0x0, 
  join_list = 0xa547008,   // contains all 3 tables 
}
(gdb)  print  &( (st_nested_join *) 0xa594cb8 )->join_list
$18 = (List<TABLE_LIST> *) 0xa594cb8

      print print_list_table_list((List<TABLE_LIST> *) 0xa594cb8)

List of TABLE_LIST: ptr=0xa594cb8
TableList Number: 1
Table List: ptr=a5941b8     ==> see below
Table: db=(null) table_name=(null) tableptr=0x0
End of Table List
TableList Number: 2
Table List: ptr=a593740
Table: db=test table_name=t1 tableptr=0xa569fb8
Table: db=test table_name=t2 tableptr=0xa592470
Table: db=test table_name=t3 tableptr=0xa5a2968
End of Table List
End of List of Table List


   print  *(TABLE_LIST*) 0xa5941b8

(gdb)    print  *(TABLE_LIST*) 0xa5941b8
$20 = {
  alias = 0x84f3da0 "(nest_last_join)", 
  on_expr = 0xa594830, 
  table = 0x0, 
  table_id = 0, 
  where = 0x0, 
  nested_join = 0xa594390, 
  embedding = 0xa594ae0, 
  join_list = 0xa594cb8, 
}

 SELECT * FROM t1 JOIN (t2 JOIN t3 ON (t2.nonkey!=t3.nonkey) ) ON (t1.nonkey = t2.nonkey) and (t3.nonkey = 3) and t2.nonkey < 4;

  print  print_item(thd, ((TABLE_LIST*) 0xa5941b8) -> on_expr)
Item: ptr=0xa594830: ((`test`.`t1`.`nonkey` = `test`.`t2`.`nonkey`) and (`test`.`t3`.`nonkey` = 3) and (`test`.`t2`.`nonkey` < 4))

The table list structures are clearer now.

Derived Tables Execution :  /** LINE 3700 */

Next Goal: 
      Derived table, sub query representation and execution:

   select * from  t1, (select * from t2 group by idx) as mytab where (mytab.idx < 3) and (t1.nonkey > 1)
                                                          and (t1.nonkey+100 > mytab.idx) ;
mysql> select * from t1;
+------+--------+
| pkey | nonkey |
+------+--------+
|    1 |      2 |
|    2 |      3 |
+------+--------+
2 rows in set (0.00 sec)


mysql> select * from t2;
+------+--------+
| idx  | nonkey |
+------+--------+
|    1 |      2 |
|    2 |      3 |
|    3 |      4 |
+------+--------+
3 rows in set (0.00 sec)

mysql> select * from t3;
+------+--------+
| idx  | nonkey |
+------+--------+
|    1 |      2 |
|    2 |      3 |
|    3 |      4 |
+------+--------+
3 rows in set (0.00 sec)


mysql>  select * from  t1, (select * from t2 group by idx) as mytab where (mytab.idx < 3) and (t1.nonkey > 1)
    ->                                                           and (t1.nonkey+100 > mytab.idx) ;
+------+--------+------+--------+
| pkey | nonkey | idx  | nonkey |
+------+--------+------+--------+
|    1 |      2 |    1 |      2 |
|    2 |      3 |    1 |      2 |
|    1 |      2 |    2 |      3 |
|    2 |      3 |    2 |      3 |
+------+--------+------+--------+
4 rows in set (0.00 sec)


The derived tables are handled here: 
#0  JOIN::exec (this=0x88d1bf0) at sql_select.cc:1763
#1  0x0828aa76 in mysql_select (thd=0x887abf0, rref_pointer_array=0x88c8c28, tables=0x88c8fe8, wild_num=0, 
    fields=@0x88c8bc4, conds=0x0, og_num=1, order=0x0, group=0x88c9268, having=0x0, proc_param=0x0, 
    select_options=2416200192, result=0x88c9ec0, unit=0x88c8cb8, select_lex=0x88c8b30) at sql_select.cc:2371
#2  0x08389808 in mysql_derived_filling (thd=0x887abf0, lex=0x887bcb0, orig_table_list=0x88c94d0)
    at sql_derived.cc:293
#3  0x083895b3 in mysql_handle_derived (lex=0x887bcb0, 
    processor=0x8389638 <mysql_derived_filling>) at sql_derived.cc:56
#4  0x08257d72 in open_and_lock_tables_derived (thd=0x887abf0, tables=0x88c8738, derived=true)
    at sql_base.cc:4982
#5  0x0821007a in open_and_lock_tables (thd=0x887abf0, tables=0x88c8738) at mysql_priv.h:1533
#6  0x0820296a in execute_sqlcom_select (thd=0x887abf0, all_tables=0x88c8738) at sql_parse.cc:4855
#7  0x08204596 in mysql_execute_command (thd=0x887abf0) at sql_parse.cc:2184
#8  0x0820d28a in mysql_parse (thd=0x887abf0, 
    inBuf=0x88c8520 "select * from  t1, (select * from t2 group by idx) as mytab where (mytab.idx < 3) and (t1.nonkey > 1) and (t1.nonkey+100 > mytab.idx)", length=133, found_semicolon=0xb6d0526c) at sql_parse.cc:5789
#9  0x0820de7d in dispatch_command (command=COM_QUERY, thd=0x887abf0, 
    packet=0x88c04c1 "select * from  t1, (select * from t2 group by idx) as mytab where (mytab.idx < 3) and (t1.nonkey > 1) and (t1.nonkey+100 > mytab.idx)", packet_length=133) at sql_parse.cc:1200
#10 0x0820f043 in do_command (thd=0x887abf0) at sql_parse.cc:857

JOIN::exec   // select * from t2 group by idx (derived table)
  need_tmp = true already set earlier.
  TABLE *curr_tmp_table= 0;
  do_select(curr_join, (List<Item> *) 0, curr_tmp_table, 0); // write into temp table
  // because the derived table part of query contains "group by" temp table is used.
  // temp tables are created at JOIN::optimize stage itself.
   ==> evaluate_join_record() ==> end_write() => 
          copy_fields(&join->tmp_table_param) =>
              do_copy_null(Copy_field *copy) =>  do_field_4(Copy_field *copy)
          ha_write_row(); write the row into temp table

Note: 
- 

Use of SELECT_LEX :
   thd->lex->select_lex
   select_lex-> {  List<Item>   item_list; // head list; List<TABLE_LIST>  top_join_list; }
   If a TABLE_LIST is a derived table, then TABLE_LIST->derived  is a select_lex_unit
   That select_lex_unit's top_join_list is what ?

For Query:  select * from (select pkey as pk, nonkey as nk from t1) as tab, t2  where tab.nk > 0 ;
3 Table instances are used up! : t1, tab, t2

For the subquery TABLE_LIST->derived(i.e.select_lex_unit).select_lex->top_join_list points to:
Table List ptr: 0x9771880 (t1) 
Linked with global tables: { t1(tlist_0x0x9771880) tab(tlist_0x0x9771f38) t2(tlist_0x0x9772398)  }
Linked with local tables: { t1(tlist_0x0x9771880)  }  // for subquery.

Use of Item expression for  "where (tab.nk>0)" :
     Item_func_gt : Item_func {   Item *args[0,1] = {tab.nk, 0} ; func_name()=">" ; }

Item_field::print(){
   If (field) print field->val_str()    //  For constants like "0"
   else Item_ident::print();            //  For variable ids like "tab.nk"
}

Item_int:Item_ident::print(){ print int value; }

Fix  prints.inc to use  thava_str(buf, sizeof buf, &my_charset_bin);

At the end of  parse_sql(), the tables are already resolved -- unknown tables would
have caused errors. TABLE_LIST structures are set up already.


Understanding Field Class  /** LINE 3810 */

Next Goal:  
  Understand TABLE Field structure and write gdbPrintField routine:

class Field
{
  Field(const Item &);				/* Prevent use of these */
  Field(uchar *ptr_arg,uint32 length_arg,uchar *null_ptr_arg,
        uchar null_bit_arg, utype unireg_check_arg,
        const char *field_name_arg);

public:

  uchar		*ptr;			// Position to field in record
  uchar		*null_ptr;		// Byte where null_bit is

  TABLE *table;                                 // Pointer for table
  TABLE *orig_table;                            // Pointer to original table

  const char	**table_name, *field_name;

  LEX_STRING	comment;

  /* Bitmap of indexes that start with this field */
  key_map	key_start;    /* key_map = BitMap<64>  */
                              /* use:   key_map->print(thava_buf) ; print thava_buf; */


  uint32	field_length;		// Length of field
  uint32	flags;
  uint16        field_index;            // field number in fields array
  uchar		null_bit;		// Bit used to test null bit
  /**
     If true, this field was created in create_tmp_field_from_item from a NULL
     value. This means that the type of the field is just a guess, and the type
     may be freely coerced to another type.

     @see create_tmp_field_from_item
     @see Item_type_holder::get_real_type

   */
  bool is_created_from_null_item;


  /* Store functions returns 1 on overflow and -1 on fatal error */
  virtual int  store(const char *to, uint length,CHARSET_INFO *cs)=0;
  virtual int  store(double nr)=0;
  ....

  virtual double val_real(void)=0;
  virtual longlong val_int(void)=0;
  virtual my_decimal *val_decimal(my_decimal *);
  inline String *val_str(String *str) { return val_str(str, str); }

  virtual String *val_str(String*,String *)=0;

  virtual uint32 pack_length() const { return (uint32) field_length; }

  virtual int compatible_field_size(uint field_metadata,
                                    const Relay_log_info *);
  /* You can have Field to remember its "metadata" information that you ask it to
     remember and use it later.
  */

  virtual uint pack_length_from_metadata(uint field_metadata)
  { return field_metadata; }
.....


  /* is it sorted as binary (unsigned chars) or text (as per char set) or signed long etc. */
  virtual bool binary() const { return 1; }
  virtual enum ha_base_keytype key_type() const { return HA_KEYTYPE_BINARY; }
  ....

  virtual uint32 key_length() const { return pack_length(); }

  virtual enum_field_types type() const =0;

  virtual enum_field_types real_type() const { return type(); }

  inline  int cmp(const uchar *str) { return cmp(ptr,str); }
...
  /*
    Caller beware: sql_type can change str.Ptr, so check
    ptr() to see if it changed if you are using your own buffer
    in str and restore it with set() if needed
  */
  virtual void sql_type(String &str) const =0;
  virtual uint size_of() const =0;		// For new field

  inline bool is_null(my_ptrdiff_t row_offset= 0)
  { return null_ptr ? (null_ptr[row_offset] & null_bit ? 1 : 0) : table->null_row; }


  inline void set_null(my_ptrdiff_t row_offset= 0)
    { if (null_ptr) null_ptr[row_offset]|= null_bit; }


  virtual bool optimize_range(uint idx, uint part);


  virtual void free() {}

  virtual Field *new_field(MEM_ROOT *root, TABLE *new_table,
                           bool keep_type);
  virtual Field *new_key_field(MEM_ROOT *root, TABLE *new_table,
                               uchar *new_ptr, uchar *new_null_ptr,
                               uint new_null_bit);

  Field *clone(MEM_ROOT *mem_root, TABLE *new_table);

  virtual void get_image(uchar *buff, uint length, CHARSET_INFO *cs)
    { memcpy(buff,ptr,length); }
  virtual void set_image(const uchar *buff,uint length, CHARSET_INFO *cs)
    { memcpy(ptr,buff,length); }

  inline longlong val_int_offset(uint row_offset)
    {
      ptr+=row_offset;
      longlong tmp=val_int();
      ptr-=row_offset;
      return tmp;
    }
  ....

  virtual bool send_binary(Protocol *protocol);

  virtual uchar *pack(uchar *to, const uchar *from,
                      uint max_length, bool low_byte_first);

  /**
     @overload Field::pack(uchar*, const uchar*, uint, bool)
  */
  uchar *pack(uchar *to, const uchar *from)
  {
    DBUG_ENTER("Field::pack");
    uchar *result= this->pack(to, from, UINT_MAX, table->s->db_low_byte_first);
    DBUG_RETURN(result);
  }

  virtual const uchar *unpack(uchar* to, const uchar *from,
                              uint param_data, bool low_byte_first);
   .....
   .....

  void copy_from_tmp(int offset);
  uint fill_cache_field(struct st_cache_field *copy);
  virtual bool get_date(MYSQL_TIME *ltime,uint fuzzydate);
  virtual bool get_time(MYSQL_TIME *ltime);

  void init(TABLE *table_arg)
  {
    orig_table= table= table_arg;
    table_name= &table_arg->alias;
  }

  /* maximum possible display length */
  virtual uint32 max_display_length()= 0;


#ifndef DBUG_OFF
  /* Print field value into debug trace, in NULL-aware way. */
  void dbug_print()
  {
    if (is_real_null())
      fprintf(DBUG_FILE, "NULL");
    else
    {
      char buf[256];
      String str(buf, sizeof(buf), &my_charset_bin);
      str.length(0);
      String *pstr;
      pstr= val_str(&str);
      fprintf(DBUG_FILE, "'%s'", pstr->c_ptr_safe());
    }
  }
#endif

  /* Hash value */
  virtual void hash(ulong *nr, ulong *nr2);
  friend int cre_myisam(char * name, register TABLE *form, uint options,
			ulonglong auto_increment_value);
  friend class Copy_field;
  friend class Item_avg_field;
  friend class Item_std_field;
  friend class Item_sum_num;
  friend class Item_sum_sum;
  friend class Item_sum_str;
  friend class Item_sum_count;
  friend class Item_sum_avg;
  friend class Item_sum_std;
  friend class Item_sum_min;
  friend class Item_sum_max;
  friend class Item_func_group_concat;

private:

protected:
};


Notes on Foreign Keys:

Foreign Keys: 
CREATE TABLE child (id INT, parent_id INT,
                    INDEX par_ind (parent_id),
                    FOREIGN KEY (parent_id) REFERENCES parent(id)
                      ON DELETE CASCADE
) ENGINE=INNODB;


Does a foreign key require an index on the referencing table ?
    No. The referenced table should have an index. However, a storage engine
like InnoDB may create an index on the referencing table automatically.

Should the columns referenced by a foreign key be primary/unique ?
    The Wikipedia foreign key definition says so. However, the MySQL
implementation does not require it. The referenced columns only need
to be a prefix of an index on the referenced table; that's all.

The only constraints supported are: [UNIQUE] INDEX/KEY, FOREIGN KEY, and NOT NULL; that's it.

Jan 22nd 2009
Tip: Note Inheritance: Item->Item_ident->Item_field

constraints:
Oracle Example:
SQL> create table t
    ( x int constraint 
       check_x check ( x > 0 ) 
       deferrable 
       initially immediate,
     y int constraint 
       check_y check ( y > 0 ) 
       deferrable 
       initially deferred
   )
    Table created.

SQL> set constraints all deferred;
SQL> set constraints all immediate;

ToDO : Query and Variable Correlation 
       This involves query-id and variable-instance-id etc.

What is the difference between store_lock() and external_lock() ?
  store_lock() is called by mysqld before reading/writing the table. 
    requested lock level is like TL_READ, TL_WRITE, TL_WRITE_ALLOW_READ, etc.
    This will be called when releasing the locks as well.
    Storage engine can downgrade/ignore lock. Or add additional locks.
    See  ha_myisammrg::store_lock() for complex implementation.

  external_lock() is called by mysqld at the start of a statement or for a LOCK TABLES statement.
  If a LOCK TABLES statement has been issued, subsequent statements may not call external_lock();
  handler::start_stmt() will be called instead.
  
   The trans_register_ha() is called from external_lock() not from store_lock()
    

Note:

 If you are using tables for a non-transactional storage engine, you must use LOCK TABLES if you want to ensure 
that no other thread modifies the tables between a SELECT and an UPDATE. The example shown here requires 
LOCK TABLES to execute safely:

LOCK TABLES trans READ, customer WRITE;
SELECT SUM(value) FROM trans WHERE customer_id=some_id;
UPDATE customer
  SET total_value=sum_from_previous_statement
  WHERE customer_id=some_id;
UNLOCK TABLES;

The correct way to use LOCK TABLES and UNLOCK TABLES with transactional tables, such as InnoDB tables, 
is to begin a transaction with SET autocommit = 0 (not START TRANSACTION) followed by LOCK TABLES, 
and to not call UNLOCK TABLES until you commit the transaction explicitly. When you call LOCK TABLES, 
InnoDB internally takes its own table lock, and MySQL takes its own table lock. InnoDB  releases its 
internal table lock at the next commit, but for MySQL to release its table lock, you have to call 
UNLOCK TABLES. You should not have autocommit = 1, because then InnoDB releases its internal table lock
immediately after the call of LOCK TABLES, and deadlocks can very easily happen. InnoDB does not 
acquire the internal table lock at all if autocommit = 1, to help old applications avoid unnecessary 
deadlocks. 

FLUSH TABLES WITH READ LOCK;
START TRANSACTION;
SELECT ... ;
UNLOCK TABLES;


Note: Mutex Locks Used in MySQL:

FAQ Mutex

Mutex                     Description 
LOCK_connection_count   - e.g. used during new connection.
LOCK_thread_count       - e.g. used during new connection.

create connection Logic:
    accept socket
    thd->thread_id = thread_id++; // user-visible thread id, starting from 1.
    Global variables:  thread_count++; connection_count++;
    Default max_connections is 150 !
    pthread_create :  handle_one_connection
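
The flow above can be illustrated with a standalone, hedged sketch of the one-thread-per-connection
pattern (simplified pthread code; the names ending in _sketch are invented for illustration and are
not mysqld's real symbols):

    #include <pthread.h>
    #include <cstdio>

    static pthread_mutex_t LOCK_thread_count_sketch= PTHREAD_MUTEX_INITIALIZER;
    static unsigned long thread_id_sketch= 1;          // user-visible ids start from 1
    static unsigned thread_count_sketch= 0;
    static unsigned connection_count_sketch= 0;
    static const unsigned max_connections_sketch= 150; // default max_connections

    struct Conn_sketch { unsigned long thread_id; };

    static void *handle_one_connection_sketch(void *arg)
    {
      Conn_sketch *c= static_cast<Conn_sketch*>(arg);
      printf("serving connection, thread_id=%lu\n", c->thread_id);
      /* ... the do_command() loop would run here ... */
      pthread_mutex_lock(&LOCK_thread_count_sketch);
      thread_count_sketch--;
      connection_count_sketch--;
      pthread_mutex_unlock(&LOCK_thread_count_sketch);
      delete c;
      return 0;
    }

    int main()
    {
      pthread_t tids[3];
      int created= 0;
      for (int i= 0; i < 3; i++)            // pretend accept() returned three sockets
      {
        pthread_mutex_lock(&LOCK_thread_count_sketch);
        if (connection_count_sketch >= max_connections_sketch)
        {
          pthread_mutex_unlock(&LOCK_thread_count_sketch);
          continue;                         // "too many connections"
        }
        Conn_sketch *c= new Conn_sketch;
        c->thread_id= thread_id_sketch++;
        thread_count_sketch++;
        connection_count_sketch++;
        pthread_mutex_unlock(&LOCK_thread_count_sketch);
        pthread_create(&tids[created++], 0, handle_one_connection_sketch, c);
      }
      for (int i= 0; i < created; i++)
        pthread_join(tids[i], 0);
      return 0;
    }

The point is only that the global counters are changed under the thread-count mutex before the
worker thread is spawned, and decremented under the same mutex when the connection ends.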

=======================================================================
FAQ Global Variables:
=======================================================================

Q: How to get global open tables ?   HASH  open_cache :

Ans:
  sql_base.cc:  HASH  open_cache;   // Contains all open TABLE instances.
This hash includes all used and unused TABLE * instances.
The global TABLE *unused_tables list is linked by the next pointer.

=======================================================================
Q: How to get global unused open tables ?  TABLE *unused_tables;
   Or for each entry in global open_cache, check for the table for 
   which  THD* t->in_use == 0

=======================================================================

Q: How to get global open table shares ? HASH table_def_cache

  sql_base.cc:   static HASH table_def_cache; // All open TABLE_SHARE instances.
Contains both used and unused list of TABLE_SHARE* instances.
Note: The  temp table shares are not part of table_def_cache.
      They are just created for the table and assigned to table->s;

=======================================================================
Q: How to get global unused  open table shares ? HASH table_def_cache


Q:  How to differentiate tables opened by substatement/toplevel statement ?
  e.g. select * from t, (select * from s) as s ; 

  These are created and assigned:  thd->query_id; table->query_id;
  thd->query_id  == table->query_id => table used by top level statement
  
=======================================================================

Q:  Where all table instances are located ?

From Table Share To table instances :

Table Share's list of used tables if active (i.e. table->in_use!=0) : 
    table->s->used_tables  (linked list of TABLE* linked by share_next ptr)

Table Share's list of free tables if table instance is inactive :
    table->s->free_tables  (OS file may be open)

Global list of unused tables : 
    TABLE  *unused_tables  (linked by next ptr)

=======================================================================
Q:  How to get open tables per given thread ?

Ans:
   DBUG_PRINT("tcache", ("open tables:"));
   for (table= thd->open_tables; table; table= table->next)
     DBUG_PRINT("tcache", ("table: '%s'.'%s' %p", table->s->db.str,
                           table->s->table_name.str, table));
=======================================================================

Q: What are the semantics of  table_share->ref_count ?
     It is the number of TABLE instances for which  "handler *share->table->file" is non-zero.
This is independent of the number of tables in use, i.e. those whose THD *table->in_use is non-zero.
i.e. A table may be in use -- but the underlying handler may not be open!
     A table may be unused -- but the underlying handler may be open!!

=======================================================================
Q: Can many table instances keep the same table open ?

   Yes.

Note: Each table in unused_tables has corresponding file open;
      Each such table t is part of t->s->free_tables

=======================================================================

Q: Important functions related to global tables/cache ?

bool close_cached_tables(thd, TABLE_LIST* tables, .... );
    If called with tables == 0 :
       Frees all entries in unused_tables. 
       [ does not take care of closing the handler table->file!!]
       Forces refresh_version++;
    else if called with specific tables != 0
       call for each table: remove_table_from_cache(thd, table->db, table->table_name);

 bool name_lock_locked_table(THD *thd, TABLE_LIST *tables)
     Given write locked table, exclusively name-lock a table.

TABLE *table_cache_insert_placeholder(THD *thd, const char*key, uint key_length);
     Used to name-lock a table name. Inserts a placeholder table into open_cache.
     Anybody opening the same table name has to check with open_cache first.

=======================================================================

How to get global list of threads ?
   It is global variable "threads" which is list of THD.
See Example: mysqld_list_processes() [ show processlist ]

=======================================================================


Tip: download fill_help_tables.sql from : http://dev.mysql.com/doc/

/tmp/trace6.txt : 89530 : two threads running. 
T@2 : log_handler checkpoint
T@4 : user thread mysql -u root
T@5 : user thread mysql -u root
T@3 : signal handler
T@1 : main thread vio_blocking

mysqladmin -u root shutdown;

case 2: 50998 : 2 user threads running

T@6 is handling :  mysqladmin -u root shutdown

T@6  : sql_parse.cc:
      mysql_parse() {
       dispatch_command() {
       Got shutdown command; set_eof_status; 
       close_thread_tables{
          trans_commit_stmt
       }
       kill_mysql ; // send kill signal to signal handling thread
       Protocol::end_statement
       close_thread_tables
       log_slow_statement
       free_root
       }  // exit dispatch_command()
       exit do_command()
       plugin_thdvar_cleanup{
         intern_plugin_unlock name=MyISAM, ref_count= 19 !!!
         // check this!!!
       }
       close_connection { close socket connection for this thread only; }
       one_thread_per_connection_end {
         unlink_thd {
          THD::cleanup { 
             trans_rollback 
             mysql_ha_cleanup {
                For each TABLE_LIST in thd->handler_tables_hash {
                  mysql_ha_close_table(thd, table_list){
                     // only first table in table_list is closed.
                     lock  &LOCK_open
                     close_thread_table(thd, &table_ptr in thd->handler_tables){
                       if table->needs_reopen() || thd->version!=refresh_version
                          free_cache_entry(table){
                             table_def_remove_table(table){
                               remove from table->s->used_tables or free_tables;
                               Also remove from global unused_tables
                             }
                          }
                       else { detach merge children; 
                          table_def_unuse_table(table){
                            remove table from table->s->used_tables move to free_tables
                            prepend to global unused_tables
                          }
                       }
                     } // end close thread table
                     if it had closed older version table, 
                         call broadcast_refresh() !!!!    // Check this !!!
                     unlock &LOCK_open
                  } // end mysql_ha_close_table
                } // end for each table_list in handlers tables
             } // end mysql_ha_cleanup
          } // end THD::cleanup
          thread_count--;
          connection_count--;
          delete thd :
          ~THD(){
             ha_close_connection(){
                plugin_foreach(thd, closecon_handlerton,...){
                  // i.e. for every plugin call close connection callback;
                } 
             }
             plugin_thdvar_cleanup(this){
                cleanup_variables thd->variables;
                For each plugin in thd->lex->plugins call: intern_plugin_unlock(){
                   Remove plugin entry for this thread from lex->plugins;
                   Get internal plugin pointer pi;
                   decrement  pi->ref_count
                   set reap_needed = true  if  ref_count == 0
                   // plugins locked per table open or per connection ?
                }
                reap_plugins();
             }
             free_root();
          } // end ~THD()
         } // end unlink_thd
         broadcast COND_thread_count to indicate thread is deleted.
       } // end one_thread_per_connection_end
       thd->end_statement() { 
          lex_end(){
             plugin_unlock_list() : free lex->plugins // redundant. done already in ~THD()
          }
       }
       thd->cleanup_after_query()
     }// end mysql_parse()

T@7 : Separate kill_server_thread was invoked which executes  kill_server(0) { 
         Note: This thread is not part of global threads list.
         kill_in_progress = true; abort_loop = 1;
         close_connections(){
            wait till select_thread_in_use  upto sometime;
            close_server_sock();
            shutdown unix_sock;
            end_thr_alarm(0);
            For each non-slave thread t in global threads{
               t->killed = THD::KILL_CONNECTION;
               thread_scheduler.post_kill_notification(t) i.e. libevent_post_kill_notification{
                   libevent_kill_thd_callback() wakes up by reading a pipe thd_kill_pair[1];
                   Add to-be-killed thd t  thds_need_processing; // This is LIST of threads;
                   libevent_thread_proc() will process this request(){
                      thd->scheduler.thread_attach();
                      login_connection(thd) if not already logged in.
                      if there are immediate processing required,
                         call do_command(thd);
                   }
               }
               t->mysys_var->abort = 1;
               Broadcast current thread t's condition; // ???
               Events::deinit(); // used for CREATE EVENT statement
               end_slave();
               If there are still active threads, sleep for 2 seconds.
               Threads waiting on blocking read call from client will have
               to be terminated by closing the connection to client:
               close_connection(t, 0, 0) {
                  vio_close(thd->net.vio);
               }
            } // for each thread
            // Wait for all threads to die.
            while (thread_count)  pthread_cond_wait(&COND_thread_count,&LOCK_thread_count);
         } // end close_connections
         unireg_end() i.e.  ::clean_up(){
           stop_handle_manager();  // currently unused.
           release_ddl_log(); 
           ha_binlog_end(current_thd); // 
           mysql_bin_log.cleanup();
           servers_free();
           acl_free();
           grant_free();
           query_cache_destroy();
           table_def_free();
           hostname_cache_free();
           mdl_destroy();
           item_user_lock_free();
           lex_free();                           /* Free some memory */
           item_create_cleanup();
           set_var_free();
           free_charsets();
           ....
           backup_shutdown();
           plugin_shutdown(){
              myrg_panic;
              maria_panic;  // waits for background maria checkpoint threads to terminate
              mi_panic; // myisam 
              ...
           }
           ha_end();
         }
      } // end kill_server

T@4    : user thread mysql -u root gets a communication error when the client connection is closed.
        >my_message_sql error: error: 1159  message: 'Got timeout reading ...'
           THD::raise_condition
             free_root
         info: Got error 2 reading command from socket socket (29)
         Protocol::end_statement
        net.vio != 0 and shutdown not flagged {
                              do_command(thd){
                                my_net_read(thd->net) with  thd->variables.net_wait_timeout;
                                if the read packet is OK, dispatch_command();
                              }
                           }
                        }
                 }
        }


Problems about install/uninstall plugins in 6.0:
  -- uninstall plugin innodb;   returns immediately by marking the plugin as deleted.
  show engine innodb status;  ==> does not work; says there is no engine called innodb.
  show plugins; ==> lists InnoDB as deleted.
  Terminating InnoDB sessions does not immediately unload the plugin.

  mysqladmin -u root refresh; ==> hangs in innobase_end (ie hton->panic )
                innobase_close_connection => 
                hton->close_connection is not called in 6.0 if the connection
                is closed after unloading the plugin.

install plugin logic :

    mysql_install_plugin(thd, &thd->lex->comment, &thd->lex->ident){
      Open  "mysql.plugin" system table {
         TABLE_LIST  tables; // initialize this for "mysql.plugin". Allocate mdl lock request.
         table = open_ltable(thd, &tables, TL_WRITE, 0);
      }
      lock LOCK_plugin;
      // rw_lock_t LOCK_system_variables_hash; Note: multiple read locks possible: rw_rdlock(&lock);
      write lock  LOCK_system_variables_hash
      load the plugin; Initialize internal plugin struct and plugin_dll struct etc.
      Initialize plugin->sysvars as well.
      If dll is already loaded, the ref counter is just incremented.....
         ==> check, can you load the same library twice ?
    }

uninstall plugin logic :

==============================================================================

Use_of_List_in_MySQL :

Use of List's in MySQL : 

  LIST    - List defined in mysys for use in C programs.
          - Simple implementation, easy to understand, bug free.
            typedef struct st_list {
              struct st_list *prev,*next; void *data;
            } LIST;
          - Doubly linked.
          - List add at the head is efficient. Append list takes O(n) time.
          - The type name is unconventional (all caps!), but the advantage is
            that a name collision with other standard libraries is less likely.

  What is iterator ?
     List itself provides functions like push_front(), pop(), elements etc.
     Only when you want to examine all elements, you use iterator like 
            it = list.begin();  it->next() or *it++ ; it->remove() etc.
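
  For comparison, here is a standalone example of the same examine-and-remove pattern using a
  standard C++ iterator (std::list, not MySQL's own list classes):

    #include <list>
    #include <cstdio>

    int main()
    {
      std::list<int> l;
      for (int i= 1; i <= 4; i++)
        l.push_back(i);

      // Walk all elements through the iterator and remove the even ones.
      for (std::list<int>::iterator it= l.begin(); it != l.end(); )
      {
        if (*it % 2 == 0)
          it= l.erase(it);       // erase() returns the next valid position
        else
          ++it;
      }

      for (std::list<int>::iterator it= l.begin(); it != l.end(); ++it)
        printf("%d ", *it);      // prints: 1 3
      printf("\n");
      return 0;
    }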

  sql_list.h :

    base_list : class base_list : public Sql_alloc {  
                 list_node *first,**last; int elements; .... 
                }
        - list_node is a single linked list.
        - push_back(), push_front(), pop() from front, concat(), etc takes O(1) time.
        - pop() from last takes O(N) time.
        - Derived from Sql_alloc; "new" allocates from thread specific mem_root.
        - You are supposed to always use this with base_list_iterator
        - Internally uses a static fixed end_of_list marker whose next is itself.
        - Too clever and complex implementation. 
          Can be difficult to examine elements during interactive debugging.

template <class T> class List :public base_list { ... }
      - Almost just a dummy wrapper over base_list-- just calls 
        base_list functions after typecasting  T* to void*

template <class T> class List_iterator :public base_list_iterator { ... }
    - Almost same as base_list_iterator. Adds overloading of ++

template <class T> class List_iterator_fast :public base_list_iterator { ... }
    - Almost same as List_iterator. Uses next_fast() instead of next().
      As I understand, the apparent gain comes from the fact that you don't need to 
      keep track of prev pointer if you don't have to delete anything from list.
      I think that the difference should be negligible and not worth having this.
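
A standalone sketch (not sql_list.h itself) of the first/**last layout that lets base_list do
push_back() in O(1) on a singly linked list; all _sketch names are invented:

    #include <cstdio>

    struct list_node_sketch
    {
      list_node_sketch *next;
      void *info;
    };

    struct base_list_sketch
    {
      list_node_sketch *first;
      list_node_sketch **last;   // address of the pointer that should receive the next node
      unsigned elements;

      base_list_sketch() : first(0), last(&first), elements(0) {}

      void push_back(void *info)             // O(1): no walk to find the tail
      {
        list_node_sketch *node= new list_node_sketch;
        node->next= 0;
        node->info= info;
        *last= node;                         // patches either 'first' or the old tail's 'next'
        last= &node->next;
        elements++;
      }

      void push_front(void *info)            // O(1) as well
      {
        list_node_sketch *node= new list_node_sketch;
        node->next= first;
        node->info= info;
        if (!first)
          last= &node->next;                 // list was empty: tail pointer must follow
        first= node;
        elements++;
      }
    };

    int main()
    {
      base_list_sketch l;
      int a= 1, b= 2, c= 3;
      l.push_back(&a);
      l.push_back(&b);
      l.push_front(&c);                      // list is now: c, a, b
      for (list_node_sketch *n= l.first; n; n= n->next)
        printf("%d ", *(int*) n->info);      // prints: 3 1 2
      printf("(%u elements)\n", l.elements);
      while (l.first)                        // free the nodes
      {
        list_node_sketch *n= l.first;
        l.first= n->next;
        delete n;
      }
      return 0;
    }

The real base_list additionally uses a shared end_of_list sentinel instead of a plain null,
which is part of what makes it harder to inspect in gdb.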

Intrusive Linked List :

   So called intrusive linked list offers just one additional thing :
   Calling  delete ptr ==> will also unlink itself from the linked list it is part of.
   This also requires that the list elements should be derived from struct ilink.
   The struct ilink contains prev and next pointers, so you need to watch out for
   name collisions with multiple inheritance.

   In general, there is too much magic happening with too much complexity.
   IMHO, a simple generic linked list would be better.
    
   Here are the basic structures :

   class base_ilist
   {
   public:
     struct ilink *first,last;
     ...
   }

   template <class T>
   class I_List :private base_ilist {
       dummy wrapper over base_ilist. Typecasts T* to ilink* and calls base function.
   }

   e.g.  I_List<THD>  threads;
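
   A minimal standalone sketch of the intrusive-list idea described above (delete also unlinks
   the element); this is not the real ilink/base_ilist code, and the _sketch names are invented:

    #include <cstdio>

    struct ilink_sketch
    {
      ilink_sketch **prev;     // points at the pointer that points to us (list head or prev->next)
      ilink_sketch *next;
      ilink_sketch() : prev(0), next(0) {}
      void unlink()
      {
        if (prev) *prev= next;
        if (next) next->prev= prev;
        prev= 0; next= 0;
      }
      virtual ~ilink_sketch() { unlink(); }  // "delete ptr" also removes it from its list
    };

    struct base_ilist_sketch
    {
      ilink_sketch *first;
      base_ilist_sketch() : first(0) {}
      void push_front(ilink_sketch *e)
      {
        e->next= first;
        e->prev= &first;
        if (first) first->prev= &e->next;
        first= e;
      }
    };

    // Elements must derive from ilink_sketch -- this is the "intrusive" part.
    struct Thd_sketch : public ilink_sketch
    {
      int id;
      Thd_sketch(int i) : id(i) {}
    };

    int main()
    {
      base_ilist_sketch threads;
      Thd_sketch *a= new Thd_sketch(1);
      Thd_sketch *b= new Thd_sketch(2);
      threads.push_front(a);
      threads.push_front(b);
      delete a;                              // unlinks itself from 'threads' as well
      for (ilink_sketch *p= threads.first; p; p= p->next)
        printf("thd id=%d\n", ((Thd_sketch*) p)->id);   // prints only: thd id=2
      delete b;
      return 0;
    }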

==============================================================================

mysys_library:

   * It is all C code. Only String lives in sql/, which is C++.

mysys string usage:

   * Use LEX_STRING for static strings. Saves on strlen() calls. (do we care?)
   * Use:  DYNAMIC_STRING; string magically grows as more data are appended.
   * Note: String in sql/sql_string.cc is aware of charset. It uses malloc.


=========================================================================

Schema, Catalog, Database : 
    A database contains multiple schemas for namespacing.
Each database contains a system catalog schema as well as user-created
schemas.

=========================================================================

We need to provide Meta Data Dictionary service.

MySQL DDL Syntax :
==================

CREATE [TEMPORARY] TABLE [IF NOT EXISTS] tbl_name
    (create_definition,...)
    [table_option] ...
    [partition_options]

Or:

CREATE [TEMPORARY] TABLE [IF NOT EXISTS] tbl_name
    [(create_definition,...)]
    [table_option] ...
    [partition_options]
    select_statement

Or:

CREATE [TEMPORARY] TABLE [IF NOT EXISTS] tbl_name
    { LIKE old_tbl_name | (LIKE old_tbl_name) }

create_definition:
    col_name column_definition
  | [CONSTRAINT [symbol]] PRIMARY KEY [index_type] (index_col_name,...)
      [index_option] ...
  | {INDEX|KEY} [index_name] [index_type] (index_col_name,...)
      [index_option] ...
  | [CONSTRAINT [symbol]] UNIQUE [INDEX|KEY]
      [index_name] [index_type] (index_col_name,...)
      [index_option] ...
  | {FULLTEXT|SPATIAL} [INDEX|KEY] [index_name] (index_col_name,...)
      [index_option] ...
  | [CONSTRAINT [symbol]] FOREIGN KEY
      [index_name] (index_col_name,...) reference_definition
  | [CONSTRAINT [symbol]] CHECK (expr)

column_definition:
    data_type [NOT NULL | NULL] [DEFAULT default_value]
      [AUTO_INCREMENT] [UNIQUE [KEY] | [PRIMARY] KEY]
      [COMMENT 'string'] [reference_definition]

data_type:
    BIT[(length)]
  | TINYINT[(length)] [UNSIGNED] [ZEROFILL]
  | SMALLINT[(length)] [UNSIGNED] [ZEROFILL]
  | MEDIUMINT[(length)] [UNSIGNED] [ZEROFILL]
  | INT[(length)] [UNSIGNED] [ZEROFILL]
  | INTEGER[(length)] [UNSIGNED] [ZEROFILL]
  | BIGINT[(length)] [UNSIGNED] [ZEROFILL]
  | REAL[(length,decimals)] [UNSIGNED] [ZEROFILL]
  | DOUBLE[(length,decimals)] [UNSIGNED] [ZEROFILL]
  | FLOAT[(length,decimals)] [UNSIGNED] [ZEROFILL]
  | DECIMAL[(length[,decimals])] [UNSIGNED] [ZEROFILL]
  | NUMERIC[(length[,decimals])] [UNSIGNED] [ZEROFILL]
  | DATE
  | TIME
  | TIMESTAMP
  | DATETIME
  | YEAR
  | CHAR[(length)]
      [CHARACTER SET charset_name] [COLLATE collation_name]
  | VARCHAR(length)
      [CHARACTER SET charset_name] [COLLATE collation_name]
  | BINARY[(length)]
  | VARBINARY(length)
  | TINYBLOB
  | BLOB
  | MEDIUMBLOB
  | LONGBLOB
  | TINYTEXT [BINARY]
      [CHARACTER SET charset_name] [COLLATE collation_name]
  | TEXT [BINARY]
      [CHARACTER SET charset_name] [COLLATE collation_name]
  | MEDIUMTEXT [BINARY]
      [CHARACTER SET charset_name] [COLLATE collation_name]
  | LONGTEXT [BINARY]
      [CHARACTER SET charset_name] [COLLATE collation_name]
  | ENUM(value1,value2,value3,...)
      [CHARACTER SET charset_name] [COLLATE collation_name]
  | SET(value1,value2,value3,...)
      [CHARACTER SET charset_name] [COLLATE collation_name]
  | spatial_type

index_col_name:
    col_name [(length)] [ASC | DESC]

index_type:
    USING {BTREE | HASH | RTREE}

index_option:
    KEY_BLOCK_SIZE [=] value
  | index_type
  | WITH PARSER parser_name
  | COMMENT 'string'

reference_definition:
    REFERENCES tbl_name (index_col_name,...)
      [MATCH FULL | MATCH PARTIAL | MATCH SIMPLE]
      [ON DELETE reference_option]
      [ON UPDATE reference_option]

reference_option:
    RESTRICT | CASCADE | SET NULL | NO ACTION

table_option:
    ENGINE [=] engine_name
  | AUTO_INCREMENT [=] value
  | AVG_ROW_LENGTH [=] value
  | [DEFAULT] CHARACTER SET [=] charset_name
  | CHECKSUM [=] {0 | 1}
  | [DEFAULT] COLLATE [=] collation_name
  | COMMENT [=] 'string'
  | CONNECTION [=] 'connect_string'
  | DATA DIRECTORY [=] 'absolute path to directory'
  | DELAY_KEY_WRITE [=] {0 | 1}
  | INDEX DIRECTORY [=] 'absolute path to directory'
  | INSERT_METHOD [=] { NO | FIRST | LAST }
  | KEY_BLOCK_SIZE [=] value
  | MAX_ROWS [=] value
  | MIN_ROWS [=] value
  | PACK_KEYS [=] {0 | 1 | DEFAULT}
  | PASSWORD [=] 'string'
  | ROW_FORMAT [=] {DEFAULT|DYNAMIC|FIXED|COMPRESSED|REDUNDANT|COMPACT}
  | TABLESPACE tablespace_name
  | UNION [=] (tbl_name[,tbl_name]...)

partition_options:
    PARTITION BY
        { [LINEAR] HASH(expr)
        | [LINEAR] KEY(column_list)
        | RANGE(expr)
        | LIST(expr) }
    [PARTITIONS num]
    [SUBPARTITION BY
        { [LINEAR] HASH(expr)
        | [LINEAR] KEY(column_list) }
      [SUBPARTITIONS num]
    ]
    [(partition_definition [, partition_definition] ...)]

partition_definition:
    PARTITION partition_name
        [VALUES {LESS THAN {(expr) | MAXVALUE} | IN (value_list)}]
        [[STORAGE] ENGINE [=] engine_name]
        [COMMENT [=] 'comment_text' ]
        [DATA DIRECTORY [=] 'data_dir']
        [INDEX DIRECTORY [=] 'index_dir']
        [MAX_ROWS [=] max_number_of_rows]
        [MIN_ROWS [=] min_number_of_rows]
        [TABLESPACE [=] tablespace_name]
        [NODEGROUP [=] node_group_id]
        [(subpartition_definition [, subpartition_definition] ...)]

subpartition_definition:
    SUBPARTITION logical_name
        [[STORAGE] ENGINE [=] engine_name]
        [COMMENT [=] 'comment_text' ]
        [DATA DIRECTORY [=] 'data_dir']
        [INDEX DIRECTORY [=] 'index_dir']
        [MAX_ROWS [=] max_number_of_rows]
        [MIN_ROWS [=] min_number_of_rows]
        [TABLESPACE [=] tablespace_name]
        [NODEGROUP [=] node_group_id]

=========================================================================

CREATE VIEW Syntax

CREATE
    [OR REPLACE]
    [ALGORITHM = {UNDEFINED | MERGE | TEMPTABLE}]
    [DEFINER = { user | CURRENT_USER }]
    [SQL SECURITY { DEFINER | INVOKER }]
    VIEW view_name [(column_list)]
    AS select_statement
    [WITH [CASCADED | LOCAL] CHECK OPTION]

==========================================================================

CREATE DATABASE Syntax

CREATE {DATABASE | SCHEMA} [IF NOT EXISTS] db_name
    [create_specification] ...

create_specification:
    [DEFAULT] CHARACTER SET [=] charset_name
  | [DEFAULT] COLLATE [=] collation_name

==========================================================================

CREATE TRIGGER Syntax

CREATE
    [DEFINER = { user | CURRENT_USER }]
    TRIGGER trigger_name trigger_time trigger_event
    ON tbl_name FOR EACH ROW trigger_stmt

trigger_event := insert | update | delete

trigger_time := before | after
CREATE TRIGGER testref BEFORE INSERT ON test1
  FOR EACH ROW BEGIN
    INSERT INTO test2 SET a2 = NEW.a1;
    DELETE FROM test3 WHERE a3 = NEW.a1;  
    UPDATE test4 SET b4 = b4 + 1 WHERE a4 = NEW.a1;
  END;

=========================================================================

LAST_INSERT_ID() is an SQL function which returns the most recently generated
AUTO_INCREMENT value in this session. The return value is session-specific,
even though other sessions may have inserted rows in the meantime.

=========================================================================

 ALTER TABLE Syntax

ALTER [IGNORE] TABLE tbl_name
    alter_specification [, alter_specification] ...

alter_specification:
    table_option ...
  | ADD [COLUMN] col_name column_definition
        [FIRST | AFTER col_name ]
  | ADD [COLUMN] (col_name column_definition,...)
  | ADD {INDEX|KEY} [index_name]
        [index_type] (index_col_name,...) [index_option] ...
  | ADD [CONSTRAINT [symbol]] PRIMARY KEY
        [index_type] (index_col_name,...) [index_option] ...
  | ADD [CONSTRAINT [symbol]]
        UNIQUE [INDEX|KEY] [index_name]
        [index_type] (index_col_name,...) [index_option] ...
  | ADD FULLTEXT [INDEX|KEY] [index_name]
        (index_col_name,...) [index_option] ...
  | ADD SPATIAL [INDEX|KEY] [index_name]
        (index_col_name,...) [index_option] ...
  | ADD [CONSTRAINT [symbol]]
        FOREIGN KEY [index_name] (index_col_name,...)
        reference_definition
  | ALTER [COLUMN] col_name {SET DEFAULT literal | DROP DEFAULT}
  | CHANGE [COLUMN] old_col_name new_col_name column_definition
        [FIRST|AFTER col_name]
  | MODIFY [COLUMN] col_name column_definition
        [FIRST | AFTER col_name]
  | DROP [COLUMN] col_name
  | DROP PRIMARY KEY
  | DROP {INDEX|KEY} index_name
  | DROP FOREIGN KEY fk_symbol
  | DISABLE KEYS
  | ENABLE KEYS
  | RENAME [TO] new_tbl_name
  | ORDER BY col_name [, col_name] ...
  | CONVERT TO CHARACTER SET charset_name [COLLATE collation_name]
  | [DEFAULT] CHARACTER SET [=] charset_name [COLLATE [=] collation_name]
  | DISCARD TABLESPACE
  | IMPORT TABLESPACE
  | partition_options
  | ADD PARTITION (partition_definition)
  | DROP PARTITION partition_names
  | COALESCE PARTITION number
  | REORGANIZE PARTITION partition_names INTO (partition_definitions)
  | ANALYZE PARTITION partition_names
  | CHECK PARTITION partition_names
  | OPTIMIZE PARTITION partition_names
  | REBUILD PARTITION partition_names
  | REPAIR PARTITION partition_names
  | REMOVE PARTITIONING

index_col_name:
    col_name [(length)] [ASC | DESC]

index_type:
    USING {BTREE | HASH | RTREE}

index_option:
    KEY_BLOCK_SIZE [=] value
  | index_type
  | WITH PARSER parser_name
  | COMMENT 'string'

=========================================================================

Note:

 If you specify ZEROFILL for a numeric column, MySQL automatically adds 
the UNSIGNED  attribute to the column.

Numeric data types that allow the UNSIGNED attribute also allow SIGNED. 
However, these data types are signed by default, so the SIGNED 
attribute has no effect.

SERIAL is an alias for BIGINT UNSIGNED NOT NULL AUTO_INCREMENT UNIQUE. 

=========================================================================

C++ use of template <class T> and template <typename T> :
    - Both are equivalent in capability. Use of typename is preferred.
    - Both allow class types as well as built-in types as parameters.
    - The typename keyword forces the following id to be treated as a
      type name and resolves ambiguity in situations like this :
             template <typename C> class template_class {
                  C::A * aobj;   // Is A a static member or a type name ???
             };
      Instead use:  typename C::A *aobj;  (see the sketch below)
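A minimal compilable illustration of that ambiguity (the type names here are
made up):

struct Cont { typedef int A; };

template <typename C>
class template_class {
  typename C::A *aobj;     // dependent name: typename marks C::A as a type
public:
  template_class() : aobj(0) {}
};

template_class<Cont> tc;   // instantiates with C::A == int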

=========================================================================

What is the difference between List<T> and List_iterator<T> ?
List supports the following :
   empty();
   push_back(void *info);                      // Use default alloc
   push_back(void *info, MEM_ROOT *memroot);   // Use user alloc
   // push_front();  // does not support user alloc !!!
   remove();
   pop();
   concat();
   disjoin();
   prepand();
   public uint elements;
   swap(base_list &rhs);
   is_empty();
   first_node();
   last_node();
   delete_elements(); // calls delete on each element.
 
List_iterator supports :

   void init(base_list &list_par);
   void *next(void);   // return current->info; move the pointer to next
   void *next_fast(void);  // prev is not updated, since delete is not
                           // allowed once you use the next_fast() method
   sublist(base_list &ls, uint elm);  
          // At the end, ls becomes the list from current pos to the end.
   void rewind(void);
   void *replace(void *element); // set new data. return old data
   void *replace(base_list &new_list); // semantics not clear.
   void remove(void);                  // Remove current
   void after(void *element)	       // Insert element after current
   operator overloading for ++ which behaves like next();
   inline T** ref(void)      { return (T**) base_list_iterator::ref(); }


Note: the current list implementation does not provide an iterator method
      to push an element at the front!!!
      That is possible only through the List methods themselves!

List_iterator_fast : (i.e. the read-only list iterator)
      silently ignores (or returns NULL from) the methods :
        remove(), replace(), after().
      Its ++ is mapped to next_fast(), not next().

Note: Ideally, there should be a compile-time error if remove() is
      called on a List_iterator_fast.
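A usage sketch, assuming the List<T>/List_iterator<T> template wrappers from
sql_list.h described above (Widget and process() are made-up names):

#include "sql_list.h"

class Widget { /* ... */ };
void process(Widget *w);              // hypothetical consumer

void walk(List<Widget> &lst)
{
  Widget *w;
  List_iterator<Widget> it(lst);
  while ((w= it++))                   // ++ behaves like next(): NULL at the end
  {
    process(w);
    // it.remove();                   // would unlink the current element
  }
  it.rewind();                        // back to the head

  List_iterator_fast<Widget> fit(lst);  // read-only; ++ maps to next_fast()
  while ((w= fit++))
    process(w);
}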
=====================================================================

Compare this to STL list : 

allocator_type  get_allocator () const ;
iterator        begin ();
iterator        end ();
size_type       size () const
reference       front ()
reference       back ()
void    swap (list< _Tp, _Alloc > &__x)
iterator   insert (iterator __position, const _Tp &__x)
iterator   insert (iterator __position)
void       insert (iterator __position, const _Tp *__first, const _Tp *__last)
void       insert (iterator __pos, size_type __n, const _Tp &__x)
void       _M_fill_insert (iterator __pos, size_type __n, const _Tp &__x)
void       push_front (const _Tp &__x)
void       push_front ()
void       push_back (const _Tp &__x)
void       push_back ()
iterator        erase (iterator __position)
iterator        erase (iterator __first, iterator __last)
void    clear ()
void    resize (size_type __new_size, const _Tp &__x)
void    resize (size_type __new_size)
void    pop_front ()
void    pop_back ()
list< _Tp, _Alloc > &   operator= (const list< _Tp, _Alloc > &__x)
void    assign (size_type __n, const _Tp &__val)
void    _M_fill_assign (size_type __n, const _Tp &__val)
void    splice (iterator __position, list &__x)
void    splice (iterator __position, list &, iterator __i)
void    splice (iterator __position, list &, iterator __first, iterator __last)
void    remove (const _Tp &__value)
void    unique ()
void    merge (list &__x)
void    reverse ()
void    sort ()

STL iterators :
  In this STL implementation, vector's iterator is simply typedef'ed to
  value_type*.  It is a random access iterator: ++, -- and * are plain
  pointer operations, so no operator overloading is involved.

  std::list's iterator, in contrast, is defined as a separate class type
  (a bidirectional iterator over the list nodes).
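For comparison, the same kind of traversal and removal with std::list and its
iterator:

#include <list>
#include <iostream>

int main()
{
  std::list<int> lst;
  lst.push_back(1);
  lst.push_front(0);                // front insertion via the container
  lst.insert(lst.end(), 2);         // insertion through an iterator position

  for (std::list<int>::iterator it= lst.begin(); it != lst.end(); )
  {
    if (*it == 1)
      it= lst.erase(it);            // erase() returns the next valid iterator
    else
      ++it;
  }

  for (std::list<int>::const_iterator it= lst.begin(); it != lst.end(); ++it)
    std::cout << *it << '\n';       // prints 0 and 2
  return 0;
}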


================================================================
Note: From iterator to list, getList() is not supported in STL,
      but it is supported in Java.
================================================================

Global variables of interest:
   - my_charset_bin
   - current_thd

-----------------------------------------------------------------
Examining si_objects.cc : Backup interface for data dictionary :
Uses mdl functions.

Examining MDL functions :
Kostja's web page :
http://forge.mysql.com/people/person.php?id=2322

WL#3726: DDL locking for all metadata objects

MDL.pdf :
  The table cache entry is used as a metadata lock (exclusive), aka name lock.
Note: It is not the table definition cache, it is the table cache!
  This cache not only serves as a cache but also serves as the base for the
  metadata locking (MDL) system.

HASH open_cache;

FLUSH TABLES logic involves just incrementing the global refresh_version and
waiting on COND_refresh until all old tables have been flushed.

close_thread_table() is "flush tables" aware: it links the table to the
unused_tables list if and only if refresh_version == TABLE::version;
otherwise it deletes the table and broadcasts on COND_refresh (so that the
waiting FLUSH TABLES will receive it).

Note:  DDL and Rollback :
DDL statements perform an implicit commit; that is why they are not rolled back
within a transaction once they complete successfully.


Meta Data Locking Related Bugs/Worklogs :

WL#4284 Transactional DDL locking

Bug #989        If DROP TABLE while there's an active transaction, wrong binlog order
WL#4284    Transactional DDL Locking: Implements lock until tx end.
           i.e. At the end of stmt, metalock is not released.
           The fix has no effect  for autocommit mode.

What needs to be done:
- Add a mechanism so that when we are going to alter/drop a table we wait not
only for tables used in running statements but also for tables used by running
transactions.

Note: Fixed after 4 years or so!

Bug #27690  Re-execution of prepared statement after table was replaced with a view crashes

Bug#25144 "replication / binlog with view breaks

WL#148 : Foreign keys for all Storage Engines
WL#3726: DDL locking for all metadata objects (include for views, stored procedures, etc)

Questions :
What is the history of problems and when they were fixed ?
  Bug #989 : Submitted: 4 Aug 2003 ;
  The fix for this and WL#4284 was committed on March 23rd :
       http://lists.mysql.com/commits/70102
  and went into 6.0.11-alpha on March 31st.

The fixes: 

 23 July 2008 : http://lists.mysql.com/commits/50346
      Maintain metalocks until tx end.
 ....

02 March 2009 : http://lists.mysql.com/commits/68032
2736 Davi Arnaut        2009-03-02
 
How does backup use MDL ?
   ddl_blocker.cc :   my_bool DDL_blocker_class::block_DDL(THD *thd)
        Implements a system-wide block of DDL.
        If there is already a block, the thd waits on the queue.

   si_objects.cc : Obj_iterator *get_databases(THD *thd)
                 Obj_iterator *get_db_tables(THD *thd, const String *db_name)
        Runs SQL to get the information from INFORMATION_SCHEMA !!!
Note: It is OK for these listing functions to run SQL.
      However, we can't afford to do this for a specific table for AQT ???

May 11, 2009 :
  Notes on aqt_db_obj.h :

class  Database : public SQL_object {
  virtual  int get_database_name(String *dbname); /*< Get database name */
  virtual  List *get_table_names();   /*< Get the list of tables*/
  virtual  List *get_view_names();    /*< Get the list of views */
};

Note: This AQT object returns a List from get_table_names() and
      get_view_names() instead of iterators.
      This is to avoid database-level locks.


SQL Mode :
   Controls the SQL syntax accepted by the server and, in this context,
   how queries/DDL are printed back (e.g. identifier quoting under ANSI_QUOTES).
SELECT @@GLOBAL.sql_mode;
SELECT @@SESSION.sql_mode;

Modes      Value
ANSI       REAL_AS_FLOAT, PIPES_AS_CONCAT, ANSI_QUOTES, IGNORE_SPACE.
DB2        PIPES_AS_CONCAT, ANSI_QUOTES, IGNORE_SPACE, NO_KEY_OPTIONS, 
           NO_TABLE_OPTIONS, NO_FIELD_OPTIONS.
MAXDB      PIPES_AS_CONCAT, ANSI_QUOTES, IGNORE_SPACE, NO_KEY_OPTIONS, 
           NO_TABLE_OPTIONS, NO_FIELD_OPTIONS, NO_AUTO_CREATE_USER.
MSSQL      PIPES_AS_CONCAT, ANSI_QUOTES, IGNORE_SPACE, NO_KEY_OPTIONS, 
           NO_TABLE_OPTIONS, NO_FIELD_OPTIONS.
MYSQL323   NO_FIELD_OPTIONS, HIGH_NOT_PRECEDENCE.
MYSQL40    NO_FIELD_OPTIONS, HIGH_NOT_PRECEDENCE.
ORACLE     PIPES_AS_CONCAT, ANSI_QUOTES, IGNORE_SPACE, NO_KEY_OPTIONS, 
           NO_TABLE_OPTIONS, NO_FIELD_OPTIONS, NO_AUTO_CREATE_USER.
....

How to find out the auto_increment=start_value table option ?
   It is stored in the storage engine, not as part of the table definition.
Hence, you need to partially fill create_info, then do:

file->update_create_info(&create_info) {
   handler::info(...);
   create_info.auto_increment_value = stats.auto_increment_value;
}
Note: Auto increment field is marked with: 
     field->unireg_check == Field::NEXT_NUMBER 

Note: This means that if auto_increment is not supported by the storage
engine, SHOW CREATE TABLE won't show this either.

How to display all info about a table ?
 mysqld_show_create(){

    store_create_info(){ 

      // Retrieve default values to records[0]
      restore_record(table, share->default_values);
      // Note table is of type TABLE*; Though TABLE_SHARE contains default_values 
      // record contents, we need a TABLE* element, so that we can call
      // table->field->val_int(str) etc methods.
      // If I remember correctly, share->field->val_int(str) won't work,
      // (because share->field->table is usually NULL?)
      For each field in table->field array do {

        print field->field_name
        field->sql_type(type_str);  print type_str;
        uint flags = field->flags;
        flags & NOT_NULL_FLAG  ==> It is non nullable
        field->unireg_check ==> specifies timestamp|auto_increment|
                                 blob|bit, etc.

        if storage_mask bit set in flags print "storage disk|memory";
        if column_format bit set, print "/*! version fixed|dynamic"
        get_field_default_value(thd, table->timestamp_field, field,
                      &def_value, 1);
        /** How default is obtained ? For example:
               tmp_use_all_columns(table, table->read_set);
               field->val_int(str); // field ptr is pointing to def value.;
         */
        /* Field *timestamp_field; Atmost one field can be timestamp */
        print field->comment.str;
     }

     Note: In general, if the same info is available in "share", use it;
     otherwise use the info from the file handler.

     Print row_type as fixed|dynamic|compressed etc from share->row_type
     /* Print checksum from share->page_checksum */
     share->db_create_options specifies flags for :
             pack_keys|checksum|delay_key_write| etc.
     file->update_create_info(&create_info){
       Retrieves info such as avg_row_length|auto_increment_value|
           table_options :- pack_keys|pack_record|delay_key_write|etc.
     }

     /* use share->key_info or table->key_info ?  */
     For each key from table->key_info do {
       keys present only for: primary|unique|fulltext|spatial|multiple
       For each key_part in key_info->key_part array print key_part:
         e.g. (i,j,k) or (a,b(5),c); /* if only b(5) used for index */
         print length only if key_part->length!=field->key_length()
     }

     /* Foreign key is obtained from storage engine. */
     str = file->get_foreign_key_create_info(); Print str;

   }

=================================================================

How is the value of a field read ?

longlong Field_short::val_int(void)
{
  ASSERT_COLUMN_MARKED_FOR_READ;
  short j;
#ifdef WORDS_BIGENDIAN
  if (table->s->db_low_byte_first)
    j=sint2korr(ptr);
  else  
#endif
    shortget(j,ptr);
  return unsigned_flag ? (longlong) (unsigned short) j : (longlong) j;
}
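
What sint2korr-style decoding amounts to (an illustrative stand-in only; the
real macros live in the server's portability headers): assemble a 2-byte
signed value stored low byte first, independent of the host byte order.

#include <stdint.h>
#include <stdio.h>

static int16_t read_int2_low_byte_first(const unsigned char *ptr)
{
  /* low byte first, regardless of host endianness */
  return (int16_t) ((uint16_t) ptr[0] | ((uint16_t) ptr[1] << 8));
}

int main(void)
{
  unsigned char rec[2]= { 0x2C, 0x01 };              /* 0x012C == 300 */
  printf("%d\n", read_int2_low_byte_first(rec));     /* prints 300 */
  return 0;
}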

=================================================================
Note: CHECK constraints are parsed but ignored by all storage engines!!!

=================================================================


enum  enum_field_types
String

=================================================================
C++ tricks :
How to allocate an array of pointers in C++ using new :
toy **tPoint;                // a pointer to a pointer to "toy" objects
tPoint = new toy *[33];      // use as an array of 33 pointers to "toy" objects
for (int i = 0; i < 33; i++) tPoint[i] = new toy;
// cleanup: delete each object first, then the pointer array itself:
//   for (int i = 0; i < 33; i++) delete tPoint[i];   delete [] tPoint;
=================================================================

FAQ:  How does killing of the thread work ?
sql_kill(){
 kill_one_thread(calling_thd, thd_id_to_kill, only_kill_query){
   Iterate through global threads and choose THD *tmp with matching id;
   [ Use LOCK_thread_count for iterating ]
   Do privilege check using thd->security_ctx and tmp->security_ctx;
   Lock  tmp->LOCK_thd_data;
   tmp->awake(kill query or connection)
   {
     /* Remove thread from alarm queue. why ??? */
     if connection has to be terminated, close the socket;
     mysys_var is a thread specific variable;

     Lock  mysys_var->mutex; 
     /* Note:  Is this needed ? We already hold LOCK_thd_data !!!
        Looks like mysys_var is protected by its own mutex as well!
        So one can get to mysys_var by locking LOCK_thd_data and
        mysys_var->mutex, and then releasing LOCK_thd_data ???
      */
     If it is non-system thread, set mysys_var->abort= 1; // abort locks.

     Check if the victim thread is currently waiting on a mutex and condition.
     Loop (a few times):
     i.e. if (mysys_var->current_cond && mysys_var->current_mutex)
     {
       // Victim thread is currently doing something like:
       //     pthread_cond_wait(&COND_refresh, &LOCK_open);
       // Now send a broadcast msg to terminate this condition wait.
       Try locking current_mutex;
       pthread_cond_broadcast(mysys_var->current_cond);
       unlock current_mutex (if locked already);
       //
       //  Note on avoiding deadlock: Another thread may do: 
       //    lock mutex; lock current_mutex; unlock mutex;
       //  So, to avoid deadlock, don't block on lock current_mutex here.
       //
     }
     Note: current_cond and current_mutex will not become NULL 
     while we do this because we hold mysys_var->mutex.
     The victim thread may not receive the broadcast if it is 
     already out of the condition wait. That's OK, because later
     it will check thd->killed flag.

     // Unlock mysys var mutex now.
     pthread_mutex_unlock(&mysys_var->mutex);
   }
   Unlock  tmp->LOCK_thd_data;
 }
}
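
A stripped-down sketch of the same wake-up pattern with plain pthreads (all
names here are made up, not the server's): the victim registers the
mutex/condition it is about to wait on; the killer sets the kill flag,
broadcasts on that condition, and never blocks on the victim's current_mutex.

#include <pthread.h>

struct mysys_var_t
{
  pthread_mutex_t  mutex;           // protects the fields below
  pthread_cond_t  *current_cond;    // condition the thread waits on, or NULL
  pthread_mutex_t *current_mutex;   // mutex paired with current_cond, or NULL
  bool             abort;
};

void kill_thread(mysys_var_t *victim, volatile bool *killed_flag)
{
  pthread_mutex_lock(&victim->mutex);
  *killed_flag= true;
  victim->abort= true;
  if (victim->current_cond && victim->current_mutex)
  {
    // Don't block on current_mutex: another thread may take it while
    // holding a lock we already own, which could deadlock.  Trylock is
    // enough; the broadcast itself is legal without holding the mutex.
    bool got_it= (pthread_mutex_trylock(victim->current_mutex) == 0);
    pthread_cond_broadcast(victim->current_cond);
    if (got_it)
      pthread_mutex_unlock(victim->current_mutex);
  }
  pthread_mutex_unlock(&victim->mutex);
  // If the victim had already left its wait, no harm: it will notice
  // *killed_flag the next time it checks.
}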

FAQ:  How does MyISAM kill the locking thread ? 

myisam_backup_engine.cc :

Backup::kill_locking_thread()   // Backup driver's method.
{
   lock THR_LOCK_myisam;
   if lock_state is already started or acquired
   {
     if lock_thd is null, i.e. not yet started, wait until it starts;
     // Note: This thread synchronizes specifically with  
     //     Backup::lock_tables_TL_READ_NO_INSERT() thread.
     // We want to do:  lock_thd->awake(kill);
     //   However, we can't do that before releasing THR_LOCK_myisam
     // If we release THR_LOCK_myisam, there is a theoretical possibility
     // that two parallel calls of awake() could work on the same thread.
     //
     //    How to be sure lock_thd is not deleted until we call awake() ?
     //  Use another flag:
     //     set a flag cannot_delete_lock_thd= TRUE;
     // 
     cannot_delete_lock_thd= TRUE;
     unlock THR_LOCK_myisam;

     Lock lock_thd->LOCK_thd_data;  lock_thd->awake(); Unlock;
     
     // Now allow lock_thd to be deleted.
     Lock THR_LOCK_myisam;  
     cannot_delete_lock_thd= FALSE;
     // Wake up the lock thread if it was blocked on the flag above.
     Wakeup lock thread on COND_lock_state; // Backup driver condition.
     // The locking thread communicates back to the killing thread
     // on the same condition. So wait for thread death!!!
     while (lock_state != LOCK_ERROR)
         Wait on COND_lock_state with THR_LOCK_myisam;
     unlock THR_LOCK_myisam;
   }
}
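
The cannot_delete_lock_thd flag is the interesting part; a generic sketch of
that deletion-guard handshake (made-up names, plain pthreads):

#include <pthread.h>

static pthread_mutex_t big_lock= PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  state_cond= PTHREAD_COND_INITIALIZER;
static bool cannot_delete_target= false;   // guard: target must stay alive

void killer()
{
  pthread_mutex_lock(&big_lock);
  cannot_delete_target= true;              // pin the target object
  pthread_mutex_unlock(&big_lock);

  // ... signal/awake the target here, without holding big_lock ...

  pthread_mutex_lock(&big_lock);
  cannot_delete_target= false;             // unpin and notify the owner
  pthread_cond_broadcast(&state_cond);
  pthread_mutex_unlock(&big_lock);
}

void owner_cleanup()
{
  pthread_mutex_lock(&big_lock);
  while (cannot_delete_target)             // wait until no killer is using us
    pthread_cond_wait(&state_cond, &big_lock);
  // safe to delete the target object now
  pthread_mutex_unlock(&big_lock);
}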

String FAQ : 
   - Defined in sql_string.h
   - class String { char *Ptr; ...; charset; ... }
   - Memory is allocated via my_malloc();
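A usage sketch, assuming the sql_string.h API (default constructor, length()
setter, append(), c_ptr()); an illustration only, not server code:

#include "sql_string.h"

void build_qualified_name(String *out, const char *db, const char *table)
{
  out->length(0);          // reset the length without freeing the buffer
  out->append(db);
  out->append(".");
  out->append(table);
  // out->c_ptr() returns a NUL-terminated char* view of the contents
}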
==============================================================================