package genetic_code_pak;			-- collection of genetic code utilities, including unigene analysis

	-- Standard genetic code: each triple is [codon, 3-letter amino-acid name, 1-letter code].
	-- NOTE(review): the three stop codons (TAA, TAG, TGA) use "*" as the name and the
	-- carriage-return character "\r" as their 1-letter code — confirm downstream code expects "\r".
	const genetic_code := {["TTT","Phe","F"], ["TTC","Phe","F"], ["TTA","Leu","L"], ["TTG","Leu","L"], ["TCT","Ser","S"],
	 ["TCC","Ser","S"], ["TCA","Ser","S"], ["TCG","Ser","S"], ["TAT","Tyr","Y"], ["TAC","Tyr","Y"], ["TAA","*","\r"],
	  ["TAG","*","\r"], ["TGT","Cys","C"], ["TGC","Cys","C"], ["TGA","*","\r"], ["TGG","Trp","W"], ["CTT","Leu","L"],
	 ["CTC","Leu","L"], ["CTA","Leu","L"], ["CTG","Leu","L"], ["CCT","Pro","P"], ["CCC","Pro","P"], ["CCA","Pro","P"],  
	 ["CCG","Pro","P"], ["CAT","His","H"], ["CAC","His","H"], ["CAA","Gln","Q"], ["CAG","Gln","Q"], ["CGT","Arg","R"], 
	 ["CGC","Arg","R"], ["CGA","Arg","R"], ["CGG","Arg","R"], ["ATT","Ile","I"], ["ATC","Ile","I"], ["ATA","Ile","I"], 
	 ["ATG","Met","M"], ["ACT","Thr","T"], ["ACC","Thr","T"], ["ACA","Thr","T"], ["ACG","Thr","T"], ["AAT","Asn","N"], 
	 ["AAC","Asn","N"], ["AAA","Lys","K"], ["AAG","Lys","K"], ["AGT","Ser","S"], ["AGC","Ser","S"], ["AGA","Arg","R"], 
	 ["AGG","Arg","R"], ["GTT","Val","V"], ["GTC","Val","V"], ["GTA","Val","V"], ["GTG","Val","V"], ["GCT","Ala","A"], 
	 ["GCC","Ala","A"], ["GCA","Ala","A"], ["GCG","Ala","A"], ["GAT","Asp","D"], ["GAC","Asp","D"], ["GAA","Glu","E"], 
	 ["GAG","Glu","E"], ["GGT","Gly","G"], ["GGC","Gly","G"], ["GGA","Gly","G"], ["GGG","Gly","G"]};
 
	 -- base-pair complement map, covering both lower- and upper-case bases
	 const complement := {["a","t"],["t","a"],["c","g"],["g","c"],["A","T"],["T","A"],["C","G"],["G","C"]};

--		Oily: AGVLIPPMWC Hydrocarbon or SH, one nitrogen in ring 	(nonpairing)
--		Polar: NQSTV CONH2 or OH 									(self-pairing)
--		Pos:  RKH	NH2 or double N in ring							(pairs with complement)
--		Neg: DE		COOH											(pairs with complement)

	-- chemical character of each 1-letter amino-acid code (legend below)
	const protein_quality := {["A","h"],["G","h"],["V","h"],["L","h"],["I","h"],["P","h"],["F","h"],["M","h"],
			["W","h"],["C","h"],["N","n"],["Q","n"],["S","n"],["T","n"],["Y","n"],
			["R","p"],["K","p"],["H","p"],["D","a"],["E","a"]};
							  -- h = hydrocarbon, a = acid, p = positive/basic, n = polar neutral
							  -- NOTE(review): an earlier legend said 'b = basic' and 'u = unknown',
							  -- but the data above uses 'p' for the basic residues R, K, H and no 'u' appears.

									-- paths to files used and written (classic-Mac colon-separated paths)
	var bioinf_course_prefix := "Diana:Diana2003:Pub:FromGiuseppeLATEST:SetlFolder:bioinformatics_course:";
	var to_genome_lengths_prefix := bioinf_course_prefix + "unigene:Unigene_analysis_and_psls:genome_lengths:";
	var to_raw_unigene_prefix := bioinf_course_prefix + "unigene:";
	var gb_active_ix_flat := OM;					-- the index flat to the genbank active file 
	var gb_active_file_handle := OM;				-- handle for the genbank active file

	-- ***** Legend for restriction enzymes data (IUPAC ambiguity codes)

		-- 3.2. Purine (adenine or guanine): R
		-- 3.3. Pyrimidine (thymine or cytosine): Y
		-- 3.4. Adenine or thymine: W
		-- 3.5. Guanine or cytosine: S
		-- 3.6. Adenine or cytosine: M
		-- 3.7. Guanine or thymine: K
		-- 3.8. Adenine or thymine or cytosine: H
		-- 3.9. Guanine or cytosine or thymine: B
		-- 3.10. Guanine or adenine or cytosine: V
		-- 3.11. Guanine or adenine or thymine: D
		-- 3.12. Guanine or adenine or thymine or cytosine: N
	
	-- per-base and per-residue weights — presumably average molecular weights in daltons; TODO confirm units
	const base_weights := {["A",329.2],["T",306.2],["U",306.2],["C",305.2],["G",345.2],["N",329.2]};
	const peptide_weights := {["A",129.1],["R",156.2],["N",114.1],["D",115.1],
			["C",103.1],["Q",128.1],["E",129.1],["G",57.1],
			["H",137.1],["I",113.1],["L",113.1],["K",128.2],["M",131.2],["F",147.2],
			["P",97.1],["S",87.1],["T",101.1],["W",186.2],["Y",163.2],["V",99.1],["B",132.6],["Z",146.6]};
	
	-- Restriction enzymes: [enzyme name, recognition site]; '^' marks the cut position,
	-- "(N)k"/"Nk" denote runs of k arbitrary bases.
	-- NOTE(review): this table appears partly corrupted — many sites contain 'S' or 'H' where 'A'
	-- is expected (e.g. Acc65_I "G^GTSCC" vs the canonical G^GTACC, Alu_I "HG^CT" vs AG^CT);
	-- the entries ["Type_II","Restriction"] and ["Enzymes_Sequence","Restriction"] look like a
	-- section header absorbed as data; and "BspM_I" appears twice with different sites.
	-- Verify against REBASE before relying on these sites.
	const restriction_enzymes_data := {["Aar_I", "AGGCCTN4^N4"], ["Aas_I", "GACNNNN^NNGTC"], ["Aat_I", "AGG^CCT"], ["Aat_II", "GACGT^C"],
		["Aau_I", "T^GTACA"], ["Acc_I", "GT^MKAC"], ["Acc_II", "CG^CG"], ["Acc_III", "T^CCGGA"],
		["Acc16_I", "TGC^GCA"], ["Acc36_I", "ACCTGC(N)4^(N)4"], ["Acc65_I", "G^GTSCC"], ["Acc113_I", "HGT^ACT"],
		["AccB1_I", "G^GYRCC"], ["AccB7_I", "CCA(N)4^NTGG"], ["AccBS_I", "CCG^CTC"], ["Aci_I", "C^CGC"],
		["Acl_I", "AA^CGTT"], ["AclW_I", "GGATC(N)4^"], ["Acs_I", "R^AATTY"], ["Acu_I", "CTGAAG(N)16^"],
		["Acv_I", "CAC^GTG"], ["Acy_I", "GR^CGYC"], ["Ade_I", "CACNNN^GTG"], ["Afa_I", "GT^AC"],
		["Afe_I", "HGC^GCT"], ["Afl_II", "C^TTSHG"], ["Afl_III", "A^CRYGT"], ["Age_I", "A^CCGGT"],
		["Ahd_I", "GACNNN^NNGTC"], ["Ahl_I", "A^CTAGT"], ["Ale_I", "CACNN^NNGTG"], ["Alo_I", "(N)5^(N)7GAAC(N)6TCC(N)12^"],
		["Alu_I", "HG^CT"], ["Alw_I", "GGATC(N)4^"], ["Alw21_I", "GWCCW^C"], ["Alw26_I", "GTCTCN^"],
		["Alw44_I", "G^TGCAC"], ["AlwN_I", "CHGNNN^CTG"], ["Ama87_I", "C^YCGRG"], ["Aor51H_I", "HGC^GCT"],
		["Apa_I", "GGGCC^C"], ["ApaL_I", "G^TGCAC"], ["Apo_I", "R^AATTY"], ["Asc_I", "GG^CGCGCC"],
		["Ase_I", "AT^TSAT"], ["AsiA_I", "A^CCGGT"], ["AsiS_I", "GCGAT^CGC"], ["Asp_I", "GACN^NNGTC"],
		["Asp700_I", "GAANN^NNTTC"], ["Asp718_I", "G^GTSCC"], ["AspA2_I", "C^CTAGG"], ["AspE_I", "GACNNN^NNGTC"],
		["AspH_I", "GWGCW^C"], ["AspLE_I", "GCG^C"], ["AspS9_I", "G^GNCC"], ["Asu_II", "TT^CGAA"],
		["AsuC2_I", "CC^SGG"], ["AsuHP_I", "GGTGA(N)8^"], ["AsuNH_I", "G^CTHGC"], ["Ava_I", "C^YCGRG"],
		["Ava_II", "G^GWCC"], ["Avi_II", "TGC^GCA"], ["Avr_II", "C^CTHGG"], ["Axy_I", "CC^TNHGG"],
		["Bae_I", "^(N)10AC(N)4"], ["Bal_I", "TGG^CCA"], ["BamH_I", "G^GATCC"], ["Ban_I", "G^GYRCC"],
		["Ban_II", "GRGCY^C"], ["Ban_III", "AT^CGAT"], ["Bbe_I", "GGCGC^C"], ["BbrP_I", "CAC^GTC"],
		["Bbs_I", "GAHGACNN^"], ["Bbu_I", "GCATG^C"], ["Bbv_I", "GCHGC(N)8^"], ["Bbv12_I", "GWGCW^C"],
		["BbvC_I", "CC^TCHGC"], ["Bcc_I", "CCATCNNNN^N"], ["BceA_I", "ACGGC(N)12^(N)2"], ["Bcg_I", "^(N)10CGA(N)6TGC(N)12^"],
		["BciV_I", "GTSTCC(N)6^"], ["Bcl_I", "T^GATCA"], ["Bcn_I", "CC^SGG"], ["Bcu_I", "A^CTHGT"],
		["Bfa_I", "C^THG"], ["Bfi_I", "ACTGGG(N)5^"], ["Bfm_I", "C^TRYHG"], ["Bfr_I", "C^TTAAG"],
		["BfrB_I", "ATG^CAT"], ["Bfu_I", "GTATCC(N)6^"], ["BfuA_I", "ACCTGCNNNN^NNNN"], ["BfuC_I", "^GATC"],
		["Bgl_I", "GCC(N)4^NGGC"], ["Bgl_II", "A^GATCT"], ["Bln_I", "C^CTHGG"], ["Blp_I", "GC^TNHGC"],
		["Bme18_I", "G^GWCC"], ["Bme1390_I", "CC^NGG"], ["Bme1580_I", "GKGCM^C"], ["BmgB_I", "GAC^GTC"],
		["Bmr_I", "ACTGGG"], ["Bmt_I", "GCTAG^C"], ["Bmy_I", "GDGCH^C"], ["Box_I", "GACNN^NNGTC"],
		["Bpi_I", "GAHGACNN^"], ["Bpl_I", "^(N)8GHG(N)5CTC(N)13^"], ["Bpm_I", "CTGGHG(N)16^"], ["Bpu10_I", "CC^TNHGC"],
		["Bpu14_I", "TT^CGAA"], ["Bpu1102_I", "GC^TNHGC"], ["BpuA_I", "GAHGACNN^"], ["BpuE_I", "CTTGAG(N)16^"],
		["Bsa_I", "GGTCTCN^"], ["Bsa29_I", "AT^CGAT"], ["BsaA_I", "YAC^GTR"], ["BsaB_I", "GATNN^NNATC"],
		["BsaH_I", "GR^CGYC"], ["BsaJ_I", "C^CNNGG"], ["BsaM_I", "GAATGCN^"], ["BsaO_I", "CGRY^CG"],
		["BsaW_I", "W^CCGGW"], ["BsaX_I", "(N)3(N)9AC(N)5CTCC(N)10^"], ["Bsc_I", "AT^CGAT"], ["Bsc4_I", "CC(N)5^NNGG"],
		["Bse1_I", "ACTGGN^"], ["Bse3D_I", "GCAATGNN^"], ["Bse8_I", "GATNN^NNATC"], ["Bse21_I", "CC^TNHGG"],
		["Bse118_I", "R^CCGGY"], ["BseA_I", "T^CCGGA"], ["BseB_I", "CCWGG"], ["BseC_I", "AT^CGAT"],
		["BseD_I", "C^CNNGG"], ["BseG_I", "GGATGNN^"], ["BseJ_I", "GATNN^NNATC"], ["BseL_I", "CC(N)5^NNGG"],
		["BseM_I", "GCAATGNN^"], ["BseM_II", "CTCHG(N)10^"], ["BseN_I", "ACTGGN^"], ["BseP_I", "G^CGCGC"],
		["BseR_I", "GHGGHG(N)10^"], ["BseS_I", "GKGCM^C"], ["BseX_I", "GCHGC(8/12)"], ["BseX3_I", "C^GGCCG"],
		["BseY_I", "C^CCAGC"], ["Bsg_I", "GTGCHG(N)16^"], ["Bsh1236_I", "CG^CG"], ["Bsh1285_I", "CGRY^CG"],
		["Bsh1365_I", "GATNN^NNATC"], ["BshF_I", "GG^CC"], ["BshN_I", "G^GYRCC"], ["BshT_I", "A^CCGGT"],
		["BsiB_I", "GATNN^NNATC"], ["BsiE_I", "CGRY^CG"], ["BsiHKA_I", "GWGCW^C"], ["BsiHKC_I", "C^YCGRG"],
		["BsiM_I", "T^CCGGA"], ["BsiS_I", "C^CGG"], ["BsiW_I", "C^GTSCG"], ["BsiY_I", "CC(N)5^NNGG"],
		["BsiZ_I", "AT^CGAT"], ["Bsl_I", "CC(N)5^NNGG"], ["BslF_I", "^(N)11GGGAC(N)10^"], ["Bsm_I", "GAATGCN^"],
		["BsmA_I", "GTCTCN^"], ["BsmB_I", "CGTCTCN^"], ["BsmF_I", "GGGAC(N)10^"], ["Bso31_I", "GGTCTCN^NNNN"],
		["BsoB_I", "C^YCGRG"], ["BsoMA_I", "GTCTCN^(N)4"], ["Bsp13_I", "T^CCGGA"], ["Bsp19_I", "C^CATGG"],
		["Bsp68_I", "TCG^CGA"], ["Bsp106_I", "AT^CGAT"], ["Bsp119_I", "TT^CGAA"], ["Bsp120_I", "G^GGCCC"],
		["Bsp143_I", "^GATC"], ["Bsp143_II", "RGCGC^Y"], ["Bsp1286_I", "GDGCH^C"], ["Bsp1407_I", "T^GTACA"],
		["Bsp1720_I", "GC^TNAGC"], ["BspAN_I", "GG^CC"], ["BspC_I", "CGAT^CG"], ["BspCN_I", "CTCAG(N)9^"],
		["BspD_I", "AT^CGAT"], ["BspE_I", "T^CCGGA"], ["BspH_I", "T^CATGA"], ["BspL_I", "GGN^NCC"],
		["BspLU11_I", "A^CATGT"], ["BspM_I", "ACCTGC(N)4^"], ["BspM_I", "CTGCA^G"], ["BspP_I", "GGATC(N)4^"],
		["BspT_I", "C^TTAAG"], ["BspT104_I", "TT^CGAA"], ["BspT107_I", "G^GYRCC"], ["BspTN_I", "GGTCTN^NNNN"],
		["BspX_I", "AT^CGAT"], ["Bsr_I", "ACTGGN^"], ["BsrB_I", "CCG^CTC"], ["BsrD_I", "GCAATGNN^"],
		["BsrF_I", "R^CCGGY"], ["BsrG_I", "T^GTACA"], ["BsrS_I", "ACTGGN^"], ["BssA_I", "R^CCGGY"],
		["BssEC_I", "C^CNNGG"], ["BssH_I", "C^TCGAG"], ["BssH_II", "G^CGCGC"], ["BssK_I", "^CCNGG"],
		["BssNA_I", "GTS^TSC"], ["BssS_I", "C^ACGHG"], ["BssT1_I", "C^CWWGG"], ["Bst2B_I", "C^ACGHG"],
		["Bst2U_I", "CC^WGG"], ["Bst4C_I", "CAN^GT"], ["Bst6_I", "CTCTTCN^NNN"], ["Bst7l_I", "GCHGC(N)8^"],
		["Bst98_I", "C^TTSHG"], ["Bst1107_I", "GTS^TSC"], ["BstAC_I", "GR^CGYC"], ["BstAP_I", "GCA(N)4^NTGC"],
		["BstAU_I", "T^GTACA"], ["BstB_I", "TT^CGAA"], ["BstBA_I", "YAC^GTR"], ["BstC8_I", "GCN^NGC"],
		["BstDE_I", "C^TNHG"], ["BstDS_I", "C^CRYGG"], ["BstE_II", "G^GTNACC"], ["BstEN_I", "CCTNN^NNNAGG"],
		["BstEN_II", "^GATC"], ["BstF5_I", "GGATGNN^"], ["BstFN_I", "CG^CG"], ["BstH2_I", "RGCGC^Y"],
		["BstHH_I", "GCG^C"], ["BstHP_I", "GTT^AAC"], ["BstKT_I", "GAT^C"], ["BstMA_I", "CTGCA^G"],
		["BstMB_I", "^GATC"], ["BstMC_I", "CGRY^CG"], ["BstMW_I", "GCNNNNN^NNGC"], ["BstN_I", "CC^WGG"],
		["BstNS_I", "RCATG^Y"], ["BstO_I", "CC^WGG"], ["BstP_I", "G^GTNACC"], ["BstPA_I", "GACNN^NNGTC"],
		["BstSC_I", "^CCNGG"], ["BstSF_I", "C^TRYHG"], ["BstSN_I", "TSC^GTS"], ["BstU_I", "CG^CG"],
		["BstV1_I", "GCAGC(N)8^(N)4"], ["BstV2_I", "GAAGACNN^NNNN"], ["BstX_I", "CCA(N)5^NTGG"], ["BstX2_I", "R^GATCY"],
		["BstY_I", "R^GATCY"], ["BstZ_I", "C^GGCCG"], ["BstZ17_I", "GTS^TSC"], ["Bsu15_I", "AT^CGAT"],
		["Bsu36_I", "CC^TNHGG"], ["BsuR_I", "GG^CC"], ["BsuTU_I", "AT^CGAT"], ["Btg_I", "CCR^YGG"],
		["Btr_I", "CAC^GTC"], ["Bts_I", "GCHGTGNN^"], ["Bve_I", "ACCTGC(N)4^(N)4"], ["Type_II", "Restriction"],
		["Enzymes_Sequence", "Restriction"], ["Cac8_I", "GCN^NGC"], ["Cai_I", "CHGNNN^CTG"], ["CciN_I", "GC^GGCCGC"],
		["Cel_II", "GC^TNHGC"], ["Cfo_I", "GCG^C"], ["Cfr_I", "Y^GGCCR"], ["Cfr9_I", "C^CCGGG"],
		["Cfr10_I", "R^CCGGY"], ["Cfr13_I", "G^GNCC"], ["Cfr42_I", "CCGC^GG"], ["Cla_I", "AT^CGAT"],
		["Cpo_I", "CGGWC^CG"], ["Csp_I", "CG^GWCCG"], ["Csp6_I", "G^TSC"], ["Csp45_I", "TT^CGAA"],
		["CspA_I", "A^CCGGT"], ["CviA_II", "C^ATG"], ["CviJ_I", "RG^CY"], ["CviR_I", "TG^CA"],
		["CviT_I", "RG^CY"], ["Cvn_I", "CC^TNHGG"], ["Dde_I", "C^TNHG"], ["Dpn_I", "GA^TC"],
		["Dpn_II", "^GATC"], ["Dra_I", "TTT^AAA"], ["Dra_II", "RG^GNCCY"], ["Dra_III", "CACNNN^GTG"],
		["Drd_I", "GAC(N)4^NNGTC"], ["DseD_I", "GAC(N)4^NNGTC"], ["Eae_I", "Y^GGCCR"], ["Eag_I", "C^GGCCG"],
		["Eam1104_I", "CTCTTCN^"], ["Eam1105_I", "GACNNN^NNGTC"], ["Ear_I", "CTCTTCN^"], ["Eci_I", "GGCGGA(N)11^"],
		["Ecl136_II", "GHG^CTC"], ["EclHK_I", "GACNNN^NNGTC"], ["EclX_I", "C^GGCCG"], ["Eco24_I", "GRGCY^C"],
		["Eco31_I", "GGTCTCN^"], ["Eco32_I", "GAT^ATC"], ["Eco47_I", "G^GWCC"], ["Eco47_III", "HGC^GCT"],
		["Eco52_I", "C^GGCCG"], ["Eco57_I", "CTGAAG(N)16^"], ["Eco57M_I", "CTGRAG16^"], ["Eco72_I", "CAC^GTG"],
		["Eco81_I", "CC^THGG"], ["Eco88_I", "C^YCGRG"], ["Eco91_I", "G^GTNACC"], ["Eco105_I", "TSC^GTS"],
		["Eco130_I", "C^CWWGG"], ["Eco147_I", "HGG^CCT"], ["EcoICR_I", "GHG^CTC"], ["EcoN_I", "CCTNN^NNNHGG"],
		["EcoO65_I", "G^GTNACC"], ["EcoO109_I", "RG^GNCCY"], ["EcoR_I", "G^AATTC"], ["EcoR_II", "^CCWGG"],
		["EcoR_V", "GAT^ATC"], ["EcoT14_I", "C^CWWGG"], ["EcoT22_I", "ATGCA^T"], ["EcoT38_I", "GRGCY^C"],
		["Ege_I", "GGC^GCC"], ["Ehe_I", "GGC^GCC"], ["Erh_I", "C^CWWGG"], ["Esp3_I", "CGTCTCN^"],
		["Fal_I", "(N)5^(N)8AAG(N)5CTT(N)13^"], ["Fat_I", "^CATG"], ["Fau_I", "CCCGC(N)4^"], ["FauND_I", "CA^TSTG"],
		["Fba_I", "T^GATCA"], ["Fbl_I", "GT^MKAC"], ["Fnu4H_I", "GC^NGC"], ["Fok_I", "GGATG(N)9^"],
		["FriO_I", "GRGCY^C"], ["Fse_I", "GGCCGG^CC"], ["Fsp_I", "TGC^GCA"], ["Fsp4H_I", "GC^NGC"],
		["FspA_I", "RTGC^GCAY"], ["Fun_I", "AGC^GCT"], ["Fun_II", "G^AATTC"], ["Gsu_I", "CTGGHG(N)16^"],
		["Hae_II", "RGCGC^Y"], ["Hae_III", "GG^CC"], ["Hap_II", "C^CGG"], ["Hga_I", "GACGC(N)5^"],
		["Hha_I", "GCG^C"], ["Hin1_I", "GR^CGYC"], ["Hin4_I", "(N)5^(N)8GAY(N)5VTC(N)13^"], ["Hin6_I", "G^CGC"],
		["Hinc_II", "GTY^RAC"], ["Hind_II", "GTY^YAC"], ["Hind_III", "A^HGCTT"], ["Hinf_I", "G^ANTC"],
		["HinP1_I", "G^CGC"], ["Hpa_I", "GTT^AAC"], ["Hpa_II", "C^CGG"], ["Hph_I", "GGTGA(N)8^"],
		["Hpy8_I", "GTN^NAC"], ["Hpy99_I", "CGWCG^"], ["Hpy188_I", "TCN^GA"], ["Hpy188_III", "TC^NNGA"],
		["HpyCH4_III", "ACN^GT"], ["HpyCH4_IV", "A^CGT"], ["HpyCH4_V", "TG^CA"], ["HpyF10_VI", "GC(N)6^NGC"],
		["Hsp92_I", "GR^CGYC"], ["Hsp92_II", "CATG^"], ["HspA_I", "G^CGC"], ["Ita_I", "GC^NGC"],
		["Kas_I", "G^GCGCC"], ["Kpn_I", "GGTSC^C"], ["Kpn2_I", "T^CCGGA"], ["Ksp_I", "CCGC^GG"],
		["Ksp22_I", "T^GATCA"], ["Ksp632_I", "CTCTTCN^"], ["KspA_I", "GTT^AAC"], ["Kzo9_I", "^GATC"],
		["Lsp_I", "TT^CGAA"], ["Lwe_I", "GCATC(N)5^(N)4"], ["Mab_I", "A^CCWGGT"], ["Mae_I", "C^TAG"],
		["Mae_II", "A^CGT"], ["Mae_III", "^GTNAC"], ["Mam_I", "GATNN^NNATC"], ["Mbi_I", "CCG^CTC"],
		["Mbo_I", "^GATC"], ["Mbo_II", "GAHGA(N)8^"], ["Mfe_I", "C^AATTG"], ["Mfl_I", "R^GATCY"],
		["Mhl_I", "GDGCH^C"], ["Mls_I", "TGG^CCA"], ["Mlu_I", "A^CGCGT"], ["MluN_I", "TGG^CCA"],
		["Mly_I", "GHGTCN5^"], ["Mly113_I", "GG^CGCC"], ["Mme_I", "TCCRAC(N)20^"], ["Mnl_I", "CCTC(N)7^"],
		["Mph1103_I", "ATGCA^T"], ["Mro_I", "T^CCGGA"], ["MroN_I", "G^CCGGC"], ["MroX_I", "GAANN^NNTTC"],
		["Msc_I", "TGG^CCA"], ["Mse_I", "T^TAA"], ["Msl_I", "CAY(N)4RTG"], ["Msp_I", "C^CGG"],
		["Msp17_I", "GR^CGYC"], ["Msp20_I", "TGG^CCA"], ["MspA1_I", "CMG^CKG"], ["MspC_I", "C^TTSHG"],
		["MspR9_I", "CC^NGG"], ["Mss_I", "GTTT^AAAC"], ["Mun_I", "C^AATTG"], ["Mva_I", "CC^WGG"],
		["Mva1269_I", "GAATGCN^"], ["Mvn_I", "CG^CG"], ["Mwo_I", "GC(N)5^NNGC"], ["Nae_I", "GCC^GGC"],
		["Nar_I", "GG^CGCC"], ["Nci_I", "CC^SGG"], ["Nco_I", "C^CATGG"], ["Nde_I", "CA^TSTG"],
		["Nde_II", "^GATC"], ["NgoM_IV", "G^CCGGC"], ["Nhe_I", "G^CTHGC"], ["Nla_III", "CATG^"],
		["Nla_IV", "GGN^NCC"], ["NmuC_I", "^GTSAC"], ["Not_I", "GC^GGCCGC"], ["Nru_I", "TCG^CGA"],
		["NruG_I", "GACNNN^NNGTC"], ["Nsb_I", "TGC^GCA"], ["Nsi_I", "ATGCA^T"], ["Nsp_I", "RCATG^Y"],
		["Nsp_III", "C^YCGRG"], ["Nsp_V", "TT^CGAA"], ["Oli_I", "CACNN^NNGTG"], ["Pac_I", "TTSAT^TSA"],
		["Pae_I", "GCATG^C"], ["PaeR7_I", "C^TCGHG"], ["Pag_I", "T^CATGA"], ["Pal_I", "GG^CC"],
		["Pau_I", "G^CGCGC"], ["Pce_I", "AGG^CCT"], ["Pci_I", "A^CATGT"], ["Pct_I", "GAATGCN^"],
		["Pdi_I", "GCC^GGC"], ["Pdm_I", "GAANN^NNTTC"], ["Pfl23_II", "C^GTSCG"], ["PflB_I", "CCANNNN^NTGG"],
		["PflF_I", "GACN^NNGTC"], ["PflM_I", "CCA(N)4^NTGG"], ["Pfo_I", "T^CCNGGA"], ["Pho_I", "GG^CC"],
		["PinA_I", "A^CCGGT"], ["Ple_I", "GHGTC(N)4^"], ["Ple19_I", "CGAT^CG"], ["PmaC_I", "CAC^GTG"],
		["Pme_I", "GTTT^AAAC"], ["Pml_I", "CAC^GTG"], ["Ppi_I", "(N)5^(N)7GAAC(N)5CTC(N)13^"], ["Pps_I", "GHGTC(N)4^"],
		["Ppu10_I", "A^TGCAT"], ["PpuM_I", "RG^GWCCY"], ["PpuX_I", "RG^GWCCY"], ["PshA_I", "GACNN^NNGTC"],
		["PshB_I", "AT^TSAT"], ["Psi_I", "TTS^TSA"], ["Psp5_II", "RG^GWCCY"], ["Psp6_I", "^CCWGG"],
		["Psp124B_I", "GAGCT^C"], ["Psp1406_I", "AA^CGTT"], ["PspA_I", "C^CCGGG"], ["PspE_I", "G^GTNACC"],
		["PspG_I", "^CCWGG"], ["PspL_I", "C^GTSCG"], ["PspN4_I", "GGN^NCC"], ["PspOM_I", "G^GGCCC"],
		["PspP_I", "G^GNCC"], ["PspPP_I", "RG^GWCCY"], ["PspX_I", "VC^TCGAGB"], ["Psr_I", "(N)5^(N)7GAAC(N)6TAC(N)12^"],
		["Pst_I", "CTGCA^G"], ["Psu_I", "R^GATCY"], ["Psy_I", "GACN^NNGTC"], ["Pvu_I", "CGAT^CG"],
		["Pvu_II", "CHG^CTG"], ["Rca_I", "T^CATGA"], ["Rsa_I", "GT^AC"], ["Rsr_II", "CG^GWCCG"],
		["Rsr2_I", "CG^GWCCG"], ["Sac_I", "GHGCT^C"], ["Sac_II", "CCGC^GG"], ["Sal_I", "G^TCGAC"],
		["SanD_I", "GG^GWCCC"], ["Sap_I", "GCTCTTCN^"], ["Sat_I", "GC^NGC"], ["Sau3A_I", "^GATC"],
		["Sau96_I", "G^GNCC"], ["Sbf_I", "CCTGCA^GG"], ["Sca_I", "HGT^ACT"], ["Sch_I", "GHGTC(N)5^"],
		["ScrF_I", "CC^NGG"], ["Sda_I", "CCTGCA^GG"], ["Sdu_I", "GDGCH^C"], ["SexA_I", "A^CCWGGT"],
		["SfaN_I", "GCATC(N)5^"], ["Sfc_I", "C^TRYHG"], ["Sfi_I", "GGCC(N)4^NGGCC"], ["Sfo_I", "GGC^GCC"],
		["Sfr274_I", "C^TCGHG"], ["Sfr303_I", "CCGC^GG"], ["Sfu_I", "TT^CGAA"], ["Sgf_I", "GCGAT^CGC"],
		["SgrA_I", "CR^CCGGYG"], ["SgrB_I", "CCGC^GG"], ["Sin_I", "G^GWCC"], ["Sla_I", "CTCGAG"],
		["Sma_I", "CCC^GGG"], ["Smi_I", "ATTT^AAAT"], ["SmiM_I", "CAYNN^NNRTG"], ["Sml_I", "C^TYRAG"],
		["Smu_I", "CCCGCNNNN^NN"], ["SnaB_I", "TSC^GTS"], ["SpaH_I", "GCATG^C"], ["Spe_I", "A^CTHGT"],
		["Sph_I", "GCATG^C"], ["Srf_I", "GCCC^GGGC"], ["Sse9_I", "^AATT"], ["Sse8387_I", "CCTGCA^GG"],
		["SseB_I", "HGG^CCT"], ["Ssi_I", "CC^GC"], ["Ssp_I", "AAT^ATT"], ["SspB_I", "T^GTSCA"],
		["Sst_I", "GHGCT^C"], ["Sst_II", "CCGC^GG"], ["Stu_I", "AGG^CCT"], ["Sty_I", "C^CWWGG"],
		["StyD4_I", "^CCNGG"], ["Sun_I", "C^GTSCG"], ["Swa_I", "ATTT^AAAT"], ["Taa_I", "ACN^GT"],
		["Tai_I", "ACGT^"], ["Taq_I", "T^CGA"], ["Taq_II", "GACCGA(N)11^"], ["Tas_I", "^AATT"],
		["Tat_I", "W^GTACW"], ["Tau_I", "GCSG^C"], ["Tel_I", "GACN^NNGTC"], ["Tfi_I", "G^AWTC"],
		["Tha_I", "CG^CG"], ["Tli_I", "CTCGAG"], ["Tru1_I", "T^TAA"], ["Tru9_I", "T^TAA"],
		["Tsc_I", "ACGT^"], ["Tse_I", "G^CWGC"], ["Tsp45_I", "^GTSAC"], ["Tsp509_I", "^AATT"],
		["TspDT_I", "ATGAA(N)11^"], ["TspE_I", "^AATT"], ["TspGW_I", "ACGGA(N)11^"], ["TspR_I", "NNCASTGNN^"],
		["Tth111_I", "GACN^NNGTC"], ["TthHB8_I", "T^CGA"], ["Van91_I", "CCA(N)4^NTGG"], ["Vha464_I", "C^TTAAG"],
		["Vne_I", "G^TGCAC"], ["VpaK11B_I", "G^GWCC"], ["Vsp_I", "AT^TAAT"], ["Xag_I", "CCTNN^NNNAGG"],
		["Xap_I", "R^AATTY"], ["Xba_I", "T^CTAGA"], ["Xce_I", "RCATG^Y"], ["Xcm_I", "CCA(N)5^(N)4TGG"],
		["Xho_I", "C^TCGAG"], ["Xho_II", "R^GATCY"], ["Xma_I", "C^CCGGG"], ["Xma_III", "C^GGCCG"],
		["XmaC_I", "C^CCGGG"], ["XmaJ_I", "C^CTAGG"], ["Xmi_I", "GT^MKAC"], ["Xmn_I", "GAANN^NNTTC"],
		["Xsp_I", "C^TAG"], ["Zho_I", "AT^CGAT"], ["Zra_I", "GAC^GTC"], ["Zsp2_I", "ATGCA^T"]};
	
	-- procedure declarations; bodies appear in the package body below
	procedure actual_data(rec_triple); 				-- read a record using its triple
	procedure cds_start_and_translation(stg); 		-- try translating a sequence in all three frames; return best piece
	procedure tom_in_protein(stg); 					-- find first start codon in DNA translation
	procedure align_by_mers(stg1,stg2,mer_size);	-- alignment of two strings by the common mer method
	procedure make_random_dna(n); 					-- make random sequences of bases of given length
	procedure histo_dna(tup,name); 					-- histogramming
	procedure prepare_dna(stg);						-- remove junk from fasta record
	procedure translate_dna(stg); 					-- translate dna to protein
	procedure rev_comp_dna(stg);  					-- reverse and complement bases in DNA string
	procedure reverse_dna(stg);  					-- reverse DNA string
	procedure complementary_dna(stg);  				-- complement bases in DNA string
	procedure peek(file_name,strt,n);				-- peek at start of specified file	 			
	procedure survey_ncbi_active_list(file_name);	-- initial survey of NCBI active list
	procedure index_ncbi_active_list(file_name);	-- index the NCBI active list, producing and writing an index flat
	procedure gb_record(gb_record_name);			-- fetch a genbank record by its name
	procedure get_by_gi(gi_no);						-- get a genbank record by its genbank gi number
	procedure data_blocks_in(lines);				-- extract the data blocks from a Genbank record
	procedure ncbi_hash(stg); 						-- hash an ncbi GB name
	procedure cut_by_enzymes(flat_stg,enz_name_list); 	-- return the list of pieces into which a string of bases 
														-- is cut by a specified list of restriction enzymes, 
														-- applied in the specified order, return triples [enzA,substg,enzB]
	procedure group_strings_by_alignment(tup_of_stgs,tags,mer_size); 	-- find closely matched subgroups in a list of strings


		-- *************************** Unigene Analysis Codes ***************************

	procedure filter_psl(file_name);				-- filter one of Toto's .psl output files 			
--	procedure build_examine_unigene_indices();		-- build indices of unigene files and then check them (moved to test program at end)
	procedure merge_psl_summaries(file_name_list,out_name);			-- merge a list of psl summary files 			
	procedure build_unigene_index(list_of_unigene_filenames,saved_index_name); 	-- prepare and save a unigene file index
	procedure tag_and_index_ug_files(file_name_tup); 	-- prepare a unigene file index also showing the starting position of the cds 
														-- this routine can prepare a single index for multiple files
														-- Note that this routine returns an index object, leaving it 
														-- to the calling routine (see build_unigene_indices) to save the index object
	procedure triples_in_several(input_file);		-- look for cross-species triples; for use with remotely related species 			
	procedure exon_length_compare_histogram(input_file1,input_file2);	-- histogram of exon lengths
	procedure histo_lengths(input_file);			-- build histogram of interior exon lengths		
	procedure triple_occurences(input_file,rix_file);	-- look for multiply occurring triples and assemble annotated groups			
	procedure show_moduli(thist_line);				-- transform a .thist line to show exon moduli and start of reading frame	
	procedure find_genbank_id(rec_start,rec_len,file_handle); 	-- read a genbank id from the header line

end genetic_code_pak;

package body genetic_code_pak;			-- collection of genetic code utilities
	use tkw,string_utility_pak,sort_pak,random_pak,rix_flat_pak,get_lines_pak;
							-- use rix_flat_pak to get the gene annotations	or data			

	-- legal characters for peptide text blocks (upper- and lower-case variants, plus blank)
	const allowed_pept_block_charsA := "AGVLIPFMWCNQSTVRKHDEY ";			-- allowed peptide characters
	const allowed_pept_block_charsB := "agvlipfmwcnqstvrkhdey ";
	const allowed_prefix_chars := "0123456789 ";	-- characters allowed in a numeric line prefix

	-- field positions within parsed .psl records: strand, query start, query end
 	const DNA_strand := 9,q_start := 12,q_end := 13;
	
	-- file handles and working state shared by the procedures of this package body
	var ihandle,ohandle_1,ohandle_2,ohandle_n,badhandle,start_line := OM,histo_exon_count,sec_count;
	var hv_debug,lines,lines_with_trips,ohandle;		
 	var rix_obj;			-- index prepared for access to Unigene information
 	var alignment_count := 0;		-- for tracking progress with alignments

procedure get_by_gi(gi_no);		-- get a genbank record by its genbank gi number
	-- Bug fix: the url assignment was embedded in a commented-out print call, so 'url'
	-- was OM (undefined) when passed to http_get. Build the url unconditionally here.
	url := "ncbi.nlm.nih.gov/entrez/viewer.fcgi?db=nucleotide&val=" + gi_no;
--	print("fetching url: ",url); 
	return http_get(url);		-- fetch the record text via the tkw socket helper below
end get_by_gi; 

procedure http_get(url); 			-- get url using tkw socket
	-- Performs a raw HTTP GET of the given url ("host/rest-of-url") on port 80
	-- and returns the response as a tuple of text lines.

	Tk ?:= tkw();			-- open the main TK window if necessary
	url_host := break(url,"/");		-- split off the host part; 'url' keeps the remainder

	sock := Tk("socket",["www." + url_host + ":80","text"]);
       						-- open socket communication with the HTML host
 	lines := [];
    sock(OM) := "GET " + url;		-- issue the HTTP request

	-- Bug fix: the original loop tested 'resp /= ""' twice; the first test should
	-- detect end-of-stream (OM) from the socket, otherwise the loop can spin on OM.
	while (resp := sock(OM)) /= OM and resp /= "" loop lines with:= resp; end loop;
	lines with:= resp; 			-- capture last line
	
	sock.socket_close();       -- close the socket
 	return lines;
 	
 end http_get;

procedure align_by_mers(stg1,stg2,mer_size);		-- alignment of two strings by the common mer method
	-- Aligns stg1 and stg2 by sorting their k-mers (k = mer_size) together, each mer tagged with
	-- its source string ('A' for stg1, 'B' for stg2) and its position. Matching mers that occur
	-- at the same offset at least three times seed maximal matched blocks, which are then pruned,
	-- ordered, and joined into a two-row alignment. Returns OM when either input is OM or no
	-- significant match exists; otherwise returns [aligned_stg1,aligned_stg2] followed by a
	-- histogram of matching run sizes as [-run_length,count] pairs sorted by decreasing run length.
--print("align_by_mers: ",stg1," ",stg2," ",mer_size);	
	if stg1 = OM or stg2 = OM then return OM; end if;				-- armoring
	
	ns1 := #stg1; ns2 := #stg2;				-- get the length of the two strings to be aligned
	
	msm1 := mer_size - 1; msp1 := mer_size + 1; msp2 := mer_size + 2;		-- calculate various utility lengths
	
					-- extract string-identity and address-tagged mers of the stated size from both strings
	nsm := #(sorted_mers := merge_sort([stg1(j..j + msm1) + "A" + str(j): j in [1..ns1 - msm1]] +  
											[stg2(j..j + msm1) + "B" + str(j): j in [1..ns2 - msm1]])); 
 
				-- find the set of mers and positions which occur in both groups, and map the start of each 
				-- A-string matched mer into the set of positions of all its string B matches, representing these as offsets.
	B_posn_set_of_A_posn := {};
	
					-- iterate over all the first elements of groups of like mers
	for mer1 = sorted_mers(j) | j = 1 or (mer1(1..mer_size) /= sorted_mers(j - 1)(1..mer_size)) loop
	
				-- get the identity tag and mer of the first element of the group
		tag := mer1(msp1); the_real_mer := mer1(1..mer_size);		-- note that the identity tag can be either 'A' or 'B'

		matching_locs := {};		-- set of locs in the B-string which match this A-string mer, or vice-versa

		for  k in [j + 1..nsm] loop		-- search forward for the end of the current group
			
			if (the_real_mer2 := sorted_mers(k))(1..mer_size) /= the_real_mer then 
					-- have reached the end of the current group; matching_locs is now the set of all
					-- mers with tag opposite to that of the first element of the group (called 'A' in these comments)
					-- calculate offset from each 'A' mer to all of the matching 'B'-mer positions				 

				for kk in [j..nsm] loop	 -- iterate again over the group, matching each 'A' element to the collected set of 'B' elements

					if (mer3 := sorted_mers(kk))(1..mer_size) /= the_real_mer then exit; end if; -- exit when end of group is reached
					if mer3(msp1) /= tag then continue; end if;			-- here we bypass all 'B' elements	
					loc_of_A := unstr(mer3(msp2..));					-- get the address of each 'A' element encountered
 
 					for B_loc in matching_locs loop		-- calculate offset from the 'A' element to each 'B' element previously collected 
  						offset := B_loc - loc_of_A; 	-- add the offset to this map, which sends each 'A' position 
  														-- into the set of all 'B' positions at which matching mers are found
 						B_posn_set_of_A_posn(loc_of_A) := (B_posn_set_of_A_posn(loc_of_A)?{}) with offset;
 
 					end loop;
 				
				end loop;
				
				exit;   -- advance to the next group of identical mers

			end if;
						-- here we are collecting all mers with tag opposite to that of the first group element
			if (mer2 := sorted_mers(k))(msp1) = tag then continue; end if;
								-- count only mers with opposite tags

			matching_locs with:= unstr(mer2(msp2..));
						-- collect the addresses of mers in the group whose tag differs from the first group element's 
			
		end loop;
			
	end loop;	-- on exit from this loop we have built the map B_posn_set_of_A_posn, which sends each address in the
				-- 'A' string into all 'B' string addresses at which matching mers are found

			-- take all the offsets and prune them down to those which occur at least three times
	histo_of_offsets := {};		-- for this, build a histogram of the number of times each offset occurs
	for [merix,matches] in B_posn_set_of_A_posn, offs in matches loop
		histo_of_offsets(offs) := (histo_of_offsets(offs)?0) + 1;
	end loop; 		

	candidate_offsets := {offs in domain(histo_of_offsets) | histo_of_offsets(offs) > 2};  -- the set of 'significant' offsets
--print("candidate_offsets: ",candidate_offsets);
	
	B_posn_set_of_A_posn := 		-- eliminate all 'insignificant' offsets from the 'B_posn_set_of_A_posn' map
			{[merix,survivors]: [merix,matches] in B_posn_set_of_A_posn | (survivors := matches * candidate_offsets) /= {}};
	
--print("B_posn_set_of_A_posn: ",{[stg1(merix..merix + msm1),matches]: [merix,matches] in B_posn_set_of_A_posn});
				
				-- Next we work left to right thru the A string, extending each match to its maximum length. 
				-- When a match is extended, the match with the same offset at the next position is removed.
	offsets_with_maxmatch_length := {};			-- will collect this map, which sends each position into the offset at which a match occurs,
												-- and the length of the longest such match (as a pair [offs,longest_match])

				-- loop over all the 'A' string addresses for which there are matching 'B' string addresses
	for spm_in_A in [1..ns1] | (bps := B_posn_set_of_A_posn(spm_in_A)) /= OM and bps /= {} loop
--print("iterating spm_in_A: ",spm_in_A);		
		
		for offs in bps loop		-- for each offset to a matching 'B' position
			
			spm_in_B := spm_in_A + offs;			-- get the matching 'B' address
			longest_match := mer_size;				-- will try to extend the match at this address

			for ma_len in [mer_size + 1..(ns2 - spm_in_B) min (ns1 - spm_in_A)] loop
								 		-- iterate, checking the characters following the known match
--print("extending longest_match:: ",spm_in_A," ",spm_in_B," ",ma_len," ", stg1(spm_in_A..spm_in_A + ma_len - 1)," ", stg2(spm_in_B..spm_in_B + ma_len - 1));
				if stg1(spm_in_A + ma_len - 1) /= stg2(spm_in_B + ma_len - 1) then exit; end if;		-- exit at first nonmatching character
				longest_match := ma_len;			-- otherwise note the longer match

				subsumed_posn := spm_in_A + ma_len - mer_size;
						-- position at which a match of size mer_size is implied by the longer match just found
						-- remove the match implied by the longer match just found
				if (bpss := B_posn_set_of_A_posn(subsumed_posn)) /= OM then B_posn_set_of_A_posn(subsumed_posn) := bpss less offs; end if;
			end loop;
									-- put the maximal match just found on record in the 'offsets_with_maxmatch_length' map
			offsets_with_maxmatch_length(spm_in_A) := (offsets_with_maxmatch_length(spm_in_A)?{}) with [offs,longest_match];

		end loop;

	end loop;
	
	-- NOTE(review): offsets_with_maxmatch_length is a map (set of pairs), so comparison with the
	-- empty tuple [] may never hold when it is the empty set {}; the '= []' guard after sorting
	-- below provides the effective empty-case exit — confirm whether '{}' was intended here.
	if offsets_with_maxmatch_length = [] then return OM; end if;			-- no match signal

--print("offsets_with_maxmatch_length: ",{[stg1(merix..merix + ma_len - 1),moffs]: [merix,ms_and_lens] in offsets_with_maxmatch_length,[moffs,ma_len] in ms_and_lens});
	
				-- sort the maximal matches in order of decreasing length, and for those of n_mer and n_mer + 1 length put those 
				-- not confirmed after those confirmed

	matches_sorted_by_length := merge_sort([[-ma_len,[merix,[moffs,ma_len]]]: [merix,ms_and_lens] in offsets_with_maxmatch_length,[moffs,ma_len] in ms_and_lens]);
	
	if matches_sorted_by_length = [] then return OM; end if; 					
	
	matches_sorted_by_length := [y: [-,y] in matches_sorted_by_length];		-- drop the sort key
--print("matches_sorted_by_length: ",ml := [[stg1(strt..strt + ln - 1),offs]: [strt,[offs,ln]] in matches_sorted_by_length]," ",#ml);

				-- iterate over the mers, in their sorted order, placing all those which do not conflict with any already placed
				-- into a list of sorted matched blocks.
	pruned_matches := [];

	for matchtrip = matches_sorted_by_length(j) loop
		
		if exists matchtrip2 in pruned_matches | matches_conflict(matchtrip,matchtrip2) then continue; end if;
		pruned_matches with:= matchtrip;

	end loop;
	
	pruned_matches := trim_ups_and_downs(merge_sort(old_pruned_matches := pruned_matches));
					-- sort the pruned matches by their starting positions and remove unlikely zigzags
	matched_blocks := [stg1(strt..strt + ln - 1): [strt,[offs,ln]] in pruned_matches];		-- the succession of blocks actually matched

--print("pruned_matches:: ",pruned_matches);
--print("pruned_matches: ",ml := [[stg1(strt..strt + ln - 1),strt,offs]: [strt,[offs,ln]] in pruned_matches]," ",#ml);
--print("matched_blocks: ",[str(strt) + ": " + str(offs) + ": " + stg1(strt..strt + ln - 1): [strt,[offs,ln]] in pruned_matches]);
	
	[-,[first_offs,-]] := pruned_matches(1);								-- the offset of the first matched block
	[last_strt,[last_offs,last_ln]] := pruned_matches(#pruned_matches);		-- data for the last matched block
	last_mc_A := last_strt + last_ln; last_mc_B := last_strt + last_ln + last_offs;
	
	lead_pair := if first_offs < 0 then [stg1(1..-first_offs),""] else ["",stg2(1..first_offs)] end if;
						-- the first 'intervening' pair of blocks

	tail_pair := [stg1(last_mc_A..),stg2(last_mc_B..)];		-- the last 'intervening' pair of blocks
				
	block_pairs_between := [lead_pair] + 
							[[stg1(endjm1 := (pmjm1 := pruned_matches(j - 1))(1) + pmjm1(2)(2)..strtj := (pmj := pruned_matches(j))(1) - 1),
								stg2(endjm1 + pmjm1(2)(1)..strtj + pmj(2)(1))]: j in [2..#pruned_matches]] with tail_pair;

	nbpb := #(block_pairs_between := [bottom_match(pair): pair in block_pairs_between]);		-- find the best bottom match of these pairs
--print("block_pairs_between: ",block_pairs_between);

				-- match the zones between these blocks by allowing only a single break and positioning it 
				-- so as to secure the largest number of matched characters. Insert underbars to indicate
				-- the position and size of the resulting break.
				-- Lower-case all non-matching characters in the result.

	res := [res1 := "" +/ [if odd(j) then block_pairs_between((j + 1)/2)(1) else matched_blocks(j/2) end if: j in [1..2 * nbpb - 1]],
			"" +/ [if odd(j) then block_pairs_between((j + 1)/2)(2) else matched_blocks(j/2) end if: j in [1..2 * nbpb - 1]]];

	-- keep only the upper-case (matched) runs of the first output row for the run-size histogram
	pieces := [p in segregate(res1,"ABCDEFGHIJKLMNOPQRSTUVWXYZ") | #p > 0 and p(1) in "ABCDEFGHIJKLMNOPQRSTUVWXYZ"];
	
			-- form the histogram of matching run sizes
	histo := {};  for p in pieces loop np := #p; histo(np) := (histo(np)?0) + 1; end loop;
	
			-- sort the histogram of matching run sizes into decreasing size order, and attach it to the output
--print("res: ",res);
	return res with [[-n,y]: [n,y] in merge_sort([[-n,hv]: [n,hv] in histo])];
	
end align_by_mers;

procedure group_strings_by_alignment(tup_of_stgs,tags,mer_size); 	-- find closely matched subgroups in a list of strings
		-- Links strings of 'tup_of_stgs' that share long runs of mers (substrings of length mer_size)
		-- at a consistent offset, groups the linked strings transitively, and prints each group with
		-- its members shifted into mutual alignment.  'tags' supplies a short label per string, used
		-- to prefix each printed line.  Nothing is returned; all output goes to 'print'.
		-- NOTE(review): addresses are encoded as position + string_index * 100000, so each input
		-- string is assumed to be shorter than 100000 characters -- TODO confirm.
print("starting group_strings_by_alignment: ",#tup_of_stgs," ",time());
	
	stand_addr_offset := 100000; msm1 := mer_size - 1; msp1 := mer_size + 1;
	
	merlist := [];			-- the list of mers to be built
	
			-- build a list of address-tagged mers, with offsets of 100K as we go from source string to source string
	for stg = tup_of_stgs(j) loop

		sl := #stg; offs := j * stand_addr_offset;
		
		for ix in [1..sl - mer_size + 1] loop
			merlist with:= (stg(ix..ix + msm1) + str(ix + offs));		-- collect the mers into 'merlist'
		end loop;
 
 	end loop;
print("merlist built: ",#merlist," ",time());	
	merlist := merge_sort(merlist); -- sort these mers (equal mers become adjacent, forming groups)
print("merlist sorted: ",time());	

	offset_hist := {};		-- in each group of like mers, form and histogram the offset belonging to different source strings 
	
	first_mer_in_group := (first_mer_and_addr := merlist(first_ix_in_group := 1))(1..mer_size);	-- the first element in a group of like mers
	
	for mer_and_addr = merlist(j) loop
	
		if (nextmer := mer_and_addr(1..mer_size)) /= first_mer_in_group then			-- a new group is starting; process the prior group
			prior_group := merlist(first_ix_in_group..j - 1);
			first_mer_in_group := nextmer; first_ix_in_group := j;
			for mer_k in prior_group, mer_m in prior_group | 				-- collect and histogram the matching offsets between distinct strings 
				(stgno1 := (addr1 := unstr(mer_k(msp1..)))/stand_addr_offset) < 
													(stgno2 := (addr2 := unstr(mer_m(msp1..)))/stand_addr_offset) loop
				offset_hist([stgno1,stgno2,addr2 - addr1]) := (offset_hist([stgno1,stgno2,addr2 - addr1])?0) + 1;
			end loop;
			
		end if;

	end loop;

	prior_group := merlist(first_ix_in_group..);					-- now collect the last group
	for mer_k in prior_group, mer_m in prior_group | 				-- collect and histogram the matching offsets between distinct strings 
		(stgno1 := (addr1 := unstr(mer_k(msp1..)))/stand_addr_offset) < 
											(stgno2 := (addr2 := unstr(mer_m(msp1..)))/stand_addr_offset) loop
		offset_hist([stgno1,stgno2,addr2 - addr1]) := (offset_hist([stgno1,stgno2,addr2 - addr1])?0) + 1;
	end loop;
--print("offset_hist: ",#offset_hist," ", time());
				
				-- prune the histogram by dropping offsets that occur fewer than five times (count > 4 kept),
				-- then rebuild it as a map from pairs [stno_1,stgno_2] to a set of pairs [offset,stg1_addr]
	offset_hist := {p in offset_hist | p(2) > 4};
print("#pruned offset_hist: ",#offset_hist," ", time());
	
	relev_offsets_and_addrs := {};
	first_mer_in_group := (first_mer_and_addr := merlist(first_ix_in_group := 1))(1..mer_size);	-- the first element in a group of like mers
	
			-- second pass over the same mer groups: this time keep only pairs whose offset
			-- survived the pruning above, recording the stg1 address of each surviving match
	for mer_and_addr = merlist(j) loop
	
		if (nextmer := mer_and_addr(1..mer_size)) /= first_mer_in_group then			-- a new group is starting; process the prior group
			prior_group := merlist(first_ix_in_group..j - 1);
			first_mer_in_group := nextmer; first_ix_in_group := j;
			for mer_k in prior_group, mer_m in prior_group | 				-- collect and histogram the matching offsets between distinct strings 
				(stgno1 := (addr1 := unstr(mer_k(msp1..)))/stand_addr_offset) < 
													(stgno2 := (addr2 := unstr(mer_m(msp1..)))/stand_addr_offset) loop
				if offset_hist([stgno1,stgno2,(offs := addr2 - addr1)]) /= OM then 
					relev_offsets_and_addrs([stgno1,stgno2]) := (relev_offsets_and_addrs([stgno1,stgno2])?{}) with [offs,addr1];
				end if;
			end loop;
			
		end if;

	end loop;
 
	prior_group := merlist(first_ix_in_group..);					-- now collect the last group

	for mer_k in prior_group, mer_m in prior_group | 				-- collect and histogram the matching offsets between distinct strings 

		(stgno1 := (addr1 := unstr(mer_k(msp1..)))/stand_addr_offset) < 
											(stgno2 := (addr2 := unstr(mer_m(msp1..)))/stand_addr_offset) loop

		if offset_hist([stgno1,stgno2,(offs := addr2 - addr1)]) /= OM then 
			relev_offsets_and_addrs([stgno1,stgno2]) := (relev_offsets_and_addrs([stgno1,stgno2])?{}) with [offs,addr1];
		end if;

	end loop;
	
				-- for each offset, sort the corresponding set of values {stg1_addr}, 
				-- and find the runs of consecutive integers, rewriting the map as [offset,[[stg1_addr,len_of_run],..]];
				-- offsets whose longest run is 4 or shorter are dropped
	relev_offsets_and_addrs := {[x,{[offs,rr]: offs in domain(y) | 
						0 max/ [rl: [strt,rl] in (rr := run_rep(merge_sort(y{offs})))] > 4}]: [x,y] in relev_offsets_and_addrs};
print("relev_offsets_and_addrs: ",#relev_offsets_and_addrs," ",time());
			
				-- form a map from pairs [stno_1,stgno_2] to length_of_longest_run
	longest_run_with_offset_and_addr := {[x,longest_run(y)]: [x,y] in relev_offsets_and_addrs | #y /= 0};			
print("longest_run_with_offset_and_addr: ",#longest_run_with_offset_and_addr," ",time());

			-- order the string pairs by decreasing longest-run length, and keep those with runs of at least 20,
			-- converting each run's encoded address back into a plain string offset
	longest_run_with_offset_and_addr := [v: [u,v] in merge_sort([[-y(3),[x,y]]: [x,y] in longest_run_with_offset_and_addr])];
	connected := [[[stix1,stix2],[trip(1)  - (stix2 - stix1) * stand_addr_offset,trip(3)]]: [[stix1,stix2],trip] in longest_run_with_offset_and_addr | trip(3) >= 20];
	
--print("connected: ",#connected); 
	parent := {};		-- maps each string index into its first parent index in the "longest_run" order, and corresponding offset
	
	for [[stix1,stix2],[offs,-]] in connected loop parent(stix2) := parent(stix2)?[stix1,offs]; end loop;
	unconnected := {1..#tup_of_stgs} - {x(1): x in range(parent)} - domain(parent);
	
print("parent: ",#parent," ",time());
	gross_offset := {};
	
			-- follow parent chains to their roots, accumulating the total offset of each string
			-- relative to the root of its group
	for stix in domain(parent) loop
		offs := 0; current := stix;
		while (pc := parent(current)) /= OM loop [par,par_offs] := pc; offs +:= par_offs; current := par; end loop;
		gross_offset(stix) := [current,offs];		-- be sure to capture the root of the group
	end loop;
print("gross_offset; ",#gross_offset," roots: ",{r: [c,[r,o]] in gross_offset});
	
	inv_offset := {[r,[c,-o]]: [c,[r,o]] in gross_offset};		-- multimap: root -> {[member,offset_of_root_relative_to_member]}
	
	for r in domain(inv_offset) loop
		mino := min/[x(2): x in (ixs_for_r := inv_offset{r} with [r,0])];
		lines_in_group := [rpad(tags(ix) + str(ix),4) + (o - mino) * " " + tup_of_stgs(ix): [ix,o] in ixs_for_r];
		
		if (nlig := #lines_in_group) = 1 then 

			print(lines_in_group(1));		-- print the only line there is

		elseif nlig = 2 then			-- compare second line to first, using two-way comparison

			[line1,line2] := lines_in_group; l1f := line1(1..4); l2f := line2(1..4);
			line1_al := line1(5..); span(line1_al," \t"); line2_al := line2(5..); span(line2_al," \t");

			[line1,line2] := align_by_mers(line1_al,line2_al,2); line1 := case_change(line1,"lu"); line2 := case_change(line2,"lu");
--print("nlig = 2; ",[line1,line2]," ",time());
			ml := #line1 min #line2; 
			compared := "" +/ [if (l1j := line1(j)) = (l2j := line2(j)) then " " elseif l2j = " " then "." else l2j end if: 
							j in [1..ml]] + line1(ml + 1..) + line2(ml + 1..);
			print(l1f + line1); print(l2f + compared);
 
 		else		-- form consensus and compare to consensus

			longest := max/[#line: line in lines_in_group];
			consensus := "    " +/ [consensus_char([line(j): line in lines_in_group | #line >= j]): j in [5..longest]];
			print(consensus);
			for line in lines_in_group loop
				compared := line(1..4) +/ [if (l1j := line(j)) = (l2j := consensus(j)) then " " else l1j end if: j in [5..#line]];
				print(compared); 
			end loop;

		end if;
		
		print();		-- blank line between groups
	end loop;
--print("unconnected: ",unconnected);
	for ix in unconnected loop
		print("\n",rpad(tags(ix) + str(ix),4) + tup_of_stgs(ix));		-- print the unconnected lines
	end loop;
	
end group_strings_by_alignment;

procedure looky(x);		-- debugging pass-through: print the value with a "looky: " label, then return it unchanged
	print("looky: ",x);
	return x;
end looky;

procedure consensus_char(tup_of_chars); 		-- most frequent non-blank character of a tuple of characters; " " if there are none
	count := {}; for c in tup_of_chars | c /= " " loop count(c) := (count(c)?0) + 1; end loop;		-- histogram the non-blank characters
	maxc := max/range(count); must := exists [c,n] in count | n = maxc;		-- the 'exists' binds c to a character achieving the maximal count (ties broken arbitrarily)
	return c?" "; 		-- c is OM when count is empty (all-blank input), hence the " " default
end consensus_char;

procedure longest_run(offsets_to_runs_map); 		-- get the longest run in an offsets_to_runs_map
		-- The map sends each offset to a set of [run_start,run_length] pairs; this returns
		-- [offset,run_start,run_length] for a run of maximal length (ties broken arbitrarily
		-- by the 'exists' search below).

	maxrlen := 0 max/ [runlen: [offs,runs] in offsets_to_runs_map, [runstrt,runlen] in runs];
	 
	must := exists [offs,runs] in offsets_to_runs_map, [runstrt,runlen] in runs | runlen = maxrlen;
			-- re-search to bind offs, runstrt, runlen to a run attaining the maximum
	
	return [offs,runstrt,runlen];
	
end longest_run;

procedure run_rep(tup); 		-- represent a tuple of integers in run format
		-- Given a (sorted) tuple of integers, returns a tuple of [first_value,run_length] pairs,
		-- one per maximal run of consecutive integers.  Returns [] for an empty argument.
--print("run_rep: ",tup);	
	if (first_in_run := tup(start_of_run := 1)) = OM then return []; end if;			-- case of empty argument
	
	runs := [];				-- will collect runs of successive integers

	for n = tup(j) | j > 1 and n /= tup(j - 1) + 1 loop		-- each element that breaks consecutiveness ends the current run
		runs with:= [first_in_run,j - start_of_run];		-- collect the run which has ended
		start_of_run := j; first_in_run := n;
	end loop;
	
	runs with:= [first_in_run,#tup - start_of_run + 1];		-- collect the last run
--print("runs: ",runs);	
	return runs; 
	
end run_rep;

procedure trim_ups_and_downs(pruned_matches); 		-- remove unlikely zigzags from the tuple of pruned matches
		-- Groups the matches into maximal spans of identical offset, then repeatedly deletes any
		-- interior group whose net_gain in its context is negative (a "profitless zigzag"),
		-- finally re-concatenating the surviving groups into a single tuple.
		-- NOTE(review): net_gain is called with stg1 and stg2, which are not parameters here --
		-- they are presumably globals set by the caller (align_by_mers); confirm.

	grouped_matches := [];			-- regroup the matches into spans of identical offset 
	
	prior_end := 0;
	
	for x = pruned_matches(j) | j > 1 and pruned_matches(j - 1)(2)(1) /= x(2)(1) loop		-- offset changes at position j
		grouped_matches with:= pruned_matches(prior_end + 1..prior_end := j - 1);		-- collect a group
	end loop;

	if prior_end < #pruned_matches then grouped_matches with:= pruned_matches(prior_end + 1..); end if;
			-- collect the last group if any
	ngm := #grouped_matches;

	while exists group = grouped_matches(j) | j > 1 and j < ngm and net_gain(stg1,stg2,grouped_matches(j - 1..j + 1)) < 0 loop
		grouped_matches(j..j) := [];		-- remove the profitless zigzag group
		ngm := #grouped_matches;
	end loop;
	
	return +/grouped_matches; 		-- flatten the surviving groups back into one tuple of matches
	
end trim_ups_and_downs;

procedure net_gain(stg1,stg2,group_in_context);		-- calculate the net gain for including a group in its context
		-- group_in_context is [prior_group,this_group,next_group]; each group is a tuple of
		-- matches of the form [start,[offset,length]].  The gain of keeping this_group is the
		-- number of characters it matches; the loss is the extra break size its offset zigzag
		-- would force.  Returns gain - loss (0 when the offset change does not reverse direction).
		-- NOTE(review): stg1 and stg2 are accepted but unused here -- TODO confirm they can be dropped.

	[prior_g,this_g,next_g] := group_in_context;		-- examine the group and its context
	[-,[prior_offs,-]] := prior_g(1); [-,[next_offs,-]] := next_g(1);  
	[-,[this_offs,-]] := this_g(1);
	
	if (jump1 := this_offs - prior_offs) * (jump2 := next_offs - this_offs) > 0 then return 0; end if; -- since change in offset does not reverse
	
	loss := (abs(jump1) + abs(jump2) - abs(next_offs - prior_offs)) / 2;
				-- we count the gain as the number of characters matched in the whole range from the
				-- start of this group to its end
	
	gain := 0 +/ [ln: [-,[-,ln]] in this_g];			-- sum the lengths of the matches in the group
	
	return gain - loss;			-- return the net gain
	
end net_gain;

procedure bottom_match(pair); 		-- match a pair of strings optimally, allowing only a single break
		-- Returns the two strings, equal-length after padding the shorter with underbars at the
		-- best single break position (found by best_split); characters that do not match the
		-- corresponding position of the other string are lower-cased.

	[stg1,stg2] := pair;		-- unpack data
	
	if (ns1 := #stg1) = (ns2 := #stg2) then 		-- equal lengths: no break needed, just lower-case mismatches
		return ["" +/ [if c = stg2(j) then c else case_change(c,"ul") end if: c = stg1(j)],
				"" +/ [if c = stg1(j) then c else case_change(c,"ul") end if: c = stg2(j)]];
	end if;
	
	if ns1 > ns2 then 		-- insert blanks into stg2
							-- find the position in stg2, from 0 to its length, after which the blanks will be inserted.
							-- this is done by scoring all the matches between stg2(1..j) and stg1(1..j), plus
							-- all the matches between stg2(j + 1..) and the like number of final characters of 
							-- stg1, and picking the best
		j_best := best_split(stg1,stg2);
		stg2(j_best + 1..j_best) := (ns1 - ns2) * "_";		-- zero-width slice assignment inserts the underbar padding
	else 							-- insert blanks into stg1
		j_best := best_split(stg2,stg1);
		stg1(j_best + 1..j_best) := (ns2 - ns1) * "_";
	end if;

	return ["" +/ [if c = stg2(j) then c else case_change(c,"ul") end if: c = stg1(j)],
				"" +/ [if c = stg1(j) then c else case_change(c,"ul") end if: c = stg2(j)]];
		
end bottom_match;

procedure best_split(long_stg,short_stg); 		-- split to match a pair of strings optimally, allowing only a single break
												-- see comment above
												-- we start by matching the short string to the very end of the long
												-- string, and then move successive characters to its front, updating the
												-- match score and tracking the optimal position.
												-- Returns the position (0..#short_stg) after which the gap belongs.
	best_split_after := 0;
	lendif := #long_stg - (nss := #short_stg);
	best_score := current_score := #[c: c = short_stg(j) | c = long_stg(j + lendif)];		-- score with the whole short string at the end
	
	for c = short_stg(j) loop		-- move the split point right one character at a time, adjusting the score incrementally
		if c = long_stg(j + lendif) then current_score -:= 1; end if;		-- character j no longer matches the tail alignment
		if c = long_stg(j) then current_score +:= 1; end if;				-- character j now matches the head alignment
		if current_score > best_score then best_split_after := j; best_score := current_score; end if;
	end loop;
	
	return best_split_after;
	
end best_split;

procedure matches_conflict(matchtrip,matchtrip2);	-- two matches conflict if their B-string positions overlap
													-- or if their A-string positions and B-string positions are differently ordered
		-- Each matchtrip has the form [start_A,[offset,length]]; the B-string zone starts at
		-- start_A + offset.  Returns true/false.  (The three containment tests per string cover
		-- all overlap cases: the fourth case, end-of-first inside second, implies one of the
		-- tested conditions.)

	[strt_A,[offs,ln]] := matchtrip; [strt_A2,[offs2,ln2]] := matchtrip2;				-- unpack the data

	ep_A := strt_A + ln - 1; ep_A2 := strt_A2 + ln2 - 1;		-- end position of the matches on the A string
	strt_B := strt_A + offs; strt_B2 := strt_A2 + offs2;		-- start position of the matches on the B string
	ep_B := strt_B + ln - 1; ep_B2 := strt_B2 + ln2 - 1;		-- end position of the matches on the B string
	
	if (strt_A > strt_A2 and strt_B < strt_B2) or (strt_A < strt_A2 and strt_B > strt_B2) then return true; end if;
			-- since A-string positions and B-string positions of the match are differently ordered
	
			--if the start or end of the second A-zone lies in the first A-zone then there is a conflict
	if (strt_A2 >= strt_A and strt_A2 <= ep_A) then return true; end if;
	if (ep_A2 >= strt_A and ep_A2 <= ep_A) then return true; end if;
	
		--if the start of the first A-zone lies in the second A-zone then there is a conflict
	if (strt_A >= strt_A2 and strt_A <= ep_A2) then return true; end if;

		--if the start or end of the second B-zone lies in the first B-zone then there is a conflict
	if (strt_B2 >= strt_B and strt_B2 <= ep_B) then return true; end if;
	if (ep_B2 >= strt_B and ep_B2 <= ep_B) then return true; end if;
	
		--if the start of the first B-zone lies in the second B-zone then there is a conflict
	if (strt_B >= strt_B2 and strt_B <= ep_B2) then return true; end if;

	return false; 			-- no conflict
									
end matches_conflict;
	
	procedure cds_start_and_translation(stg); 	-- try translating a sequence in all three frames; return best piece
			-- Returns [offset_in_dna, translation_from_first_Met] for the longest stop-free
			-- translated piece (over the three reading frames) that contains a Met.
			-- NOTE(review): if no frame yields such a piece, max_onfrom_m is never assigned and
			-- the returned pair is [0,OM] -- callers should be prepared for that; confirm intended.
--print("cds_start_and_translation: ",stg);
 		if #(stg := prepare_dna(stg)) < 3 then return [1,""]; end if;		-- too short to hold even one codon

		max_len := frame_start := offs_to_max := 0; max_stg := "";		-- will find longest string

		for j in [1..3] loop 		-- the three possible reading frames

			pieces := breakup(translate_dna(stg(j..)),"\r");		-- uninterrupted (stop-free) pieces of the translation

			for piece =  pieces(k) | (tom_p := tom_in_protein(piece))(2) /= "" and (np := #piece) > max_len loop 
					-- keep only pieces containing a Met ('M'), and track the longest seen so far

				max_len := np; frame_start := j; max_translation := piece;  max_onfrom_m := piece(tom_p(1) + 1..);
				offs_to_max := 3 * tom_p(1) + frame_start +/ [3 * (#pce + 1): pce in pieces(1..k - 1)];
						-- DNA offset of the Met: 3 bases per residue before it, plus the frame shift,
						-- plus 3 bases per residue (and stop) of each earlier piece

			end loop;
			
		end loop;
--print(stg(offs_to_max..)," ",max_onfrom_m," \n\n",translate_dna(stg(offs_to_max..)));
		return [offs_to_max,max_onfrom_m];
		
	end cds_start_and_translation;

	procedure make_random_dna(n); 				-- make and print a random sequence of bases of the given length
		-- BUG FIX: the length parameter n was previously ignored and exactly 50 bases were
		-- always produced; the sequence length now honors n (printing "" when n <= 0).
		rand_handle := start_random(["a","t","g","c"],OM);			-- uniform sampler over the four bases
		print("" +/ [random(rand_handle): j in [1..n]]);			-- "" seed makes the reduction safe for n = 0
	end make_random_dna;

	procedure histo_dna(tup,name); 				-- histogramming
			-- Draws the integer tuple 'tup' as a bar chart of gold rectangles in a new Tk toplevel
			-- window titled 'name'.  Uses (and lazily initializes) the global Tk interpreter handle.
		Tk ?:= Tkw();							-- create the Tk interpreter on first use
		toplev := Tk("toplevel","10,10"); toplev(OM) := name;		-- new window; assigning to OM sets its title
		canv := toplev("canvas","500,400"); canv("side") := "top";
		rects := [canv("rectangle",Tk.tostr([x := 10 + 10 * j,y := 390,x + 11,y - t])): t = tup(j)];
				-- one 11-pixel-wide bar per tuple element, height t, baseline at y = 390
		for rect in rects loop rect("fill") := "gold"; end loop;
	end histo_dna;
	
	-- Split a protein string at its first 'M' (Met): returns [number_of_chars_before_M, remainder_from_M_on].
	-- Callers test the second component against "" to detect a Met -- NOTE(review): confirm break's
	-- behavior when no 'M' is present matches that convention.
	procedure tom_in_protein(stg); front := break(stg,"M"); return [#front,stg]; end tom_in_protein;

	-- Normalize raw DNA text: strip blanks, tabs, and digits (via the suppress_chars helper), then upper-case.
	procedure prepare_dna(stg); return case_change(suppress_chars(stg," \t0123456789"),"lu"); end prepare_dna;

	procedure translate_dna(stg); 		-- translate a DNA string, codon by codon, into 1-letter amino acid codes
		if g_code = OM then g_code := {[x,z]: [x,y,z] in genetic_code}; end if;
				-- lazily build the codon -> 1-letter map from the global genetic_code table (g_code is a global cache)
		
		return "" +/[g_code(stg(i..i + 2))?"?": i in [1,4..#stg - 2]];
				-- unknown codons become "?"; stop codons appear as "\r" per the table; trailing partial codon ignored
	end translate_dna;

	procedure rev_comp_dna(stg); 		-- reverse complement of a DNA string (uses the global 'complement' map)
		stg := case_change(stg,"lu");		-- force upper case so the map's upper-case entries apply
		return +/[complement(stg(j)): j in [#stg,#stg - 1..1]]; 		-- NOTE(review): yields OM for an empty string (+/ of the empty tuple)
	end rev_comp_dna;

	procedure reverse_dna(stg); 		-- return the given string reversed (OM for the empty string, as +/ over an empty tuple)
		return +/[stg(#stg - j + 1): j in [1..#stg]]; 
	end reverse_dna;
	
	procedure complementary_dna(stg); 		-- complement each base in place, without reversing (uses the global 'complement' map)
		return +/[complement(c): c = stg(j)]; 
	end complementary_dna;

procedure lines_with_runs(line_ix_set,run_len);				-- histogram runs of specified length in indicated set or tuple of lines
		-- For each summary line index in line_ix_set, extracts runs of run_len consecutive
		-- interior exon lengths (field 6 of the tab-separated line, comma-separated) and returns
		-- a map from each distinct run string to the set of line indices containing it.
		-- NOTE(review): reads the global 'lines' tuple, which the caller must have loaded.

	lines_with := {};			
	
	for line_ix in line_ix_set loop

		pieces := breakup(line := lines(line_ix),"\t");			-- break summary line at tabs;
		runs_seen_in_this_line := {};			-- collection of interior triples seen so far in this line

		if (nel := #(exlens := breakup(pieces(6),","))) >= run_len + 2 then
							-- at least 1 interior run of specified length; get runs
							-- (first and last exon lengths are excluded as terminal)
			for j in [2..nel - run_len] | (run := join(exlens(j..j + run_len - 1),","))  notin runs_seen_in_this_line loop
				runs_seen_in_this_line with:= run;  		-- count each distinct run only once per line
				lines_with(run) := (lines_with(run)?{}) with line_ix;
			end loop;
		
		end if;
		
	end loop; 
	
	return lines_with;
	
end lines_with_runs;

		-- ******************************************************************************
		-- *************************** Unigene Analysis Codes ***************************
		-- ******************************************************************************

			-- Here follow miscellaneous analyses of unigene files: preparation of .1, .2., .n, .bad, .smry, .smerge, .withoffs,
			--  and .thist files. There are also .D_smry and .Z_smry files etc., which are the single-species analogs of the .smerge
			-- these files have the following sources and forms:

			-- .1, .2., .n, .bad: files prepared from the original .psl files by selecting the longest-matching line
			-- and then segregating into genes with 1, 2, 3 or more exons, or bad-case genes.	 With the exception of
			-- these latter, each file entry consists of a header line giving general unigene information and a data line
			-- giving the exon and intron lengths, genome starting position of each exon, and genbank identifier.
			-- These files are prepared from the original .psl files by 'filter_psl'.

			-- .smry files: 1 line per gene, giving size of longest nonterminal exon, its position in the list of exons,
			-- the coding sequence length, list of exon lengths, strand, genbank identifier of mRNA, and the list of intron lengths.
			-- This file is prepared from the .n files by 'filter_psl'.

			-- .smerge files: these are simple merges of several .smry files,  but with each line prefixed
			-- by a 1-letter code for the species to which the line belongs. They are prepared by 'merge_psl_summaries'.

			-- .thist files: these are intended to group related genes and gene families for several species,
			-- but can be used for just one. they consist of a group header file stating the number of elements in a group,
			-- followed by several .smry lines, each prefixed by a 1-letter code for the species to which the line belongs.  
			-- They are prepared by 'triple_occurences'.
			
			-- the indexed unigene files assumed as input by the 'triple_occurences' routine are prepared using the
			-- 'tag_and_index_ug_files' and 'cds_start_and_translation' procedures.
			-- These append the start of the best open reading frame for each unigene item to the header line
			-- of the item, and then remove line breaks from the following data, thereby reducing each Unigene
			-- record to a header line followed by a single data line in the resulting '.withoffs' files

procedure filter_psl(file_name);			-- filter one of Toto's .psl output files 			
		-- Phase 1: scan the .psl file section by section (sections start at ">" header lines),
		-- pick the best data line of each section via process_section_buffer, and distribute
		-- header/data pairs to the .1/.2/.n/.bad output files.
		-- Phase 2: re-read the .n file, sort its pairs by the numeric tag prefixed in phase 1,
		-- rewrite it in sorted order, and emit a one-line-per-gene .smry summary file.
		-- NOTE(review): the file handles and histo_exon_count set here are globals shared with
		-- process_section_buffer; DNA_strand is a global column-index constant -- confirm its value.
	
	print("Starting filter: ",file_name," ",time());
	ihandle := open(file_name,"TEXT-IN");					-- open the input file
	ohandle_1 := open(file_name + ".1","TEXT-OUT"); 		-- open all the output files, except the summary file
	ohandle_2 := open(file_name + ".2","TEXT-OUT");
	ohandle_n := open(file_name + ".n","TEXT-OUT");
	badhandle := open(file_name + ".bad","TEXT-OUT");

	sec_buffer := []; sec_count := 0;			-- will collect lines by sections
	
	histo_exon_count := [];						-- will also generate a histogram of exon lengths

	while not eof() loop						-- loop over all input lines

		geta(ihandle,x); if x = OM then exit; end if;
		
		if x(1) = ">" then 						-- a new section is starting
		
			if start_line /= OM then 			-- print out prior section info
				process_section_buffer(sec_buffer); 			-- find the best line in the section, and write it out
			end if;
			
			sec_count +:= 1; sec_buffer := [start_line := x];		-- start a new section with the new header line
			
--if sec_count > 10 then stop; end if;
			if sec_count mod 10000 = 0 then print(sec_count," ",time()); end if;		-- indicate progress in section count
		else
			sec_buffer with:= x;									-- otherwise just collect the line into the current section
		end if; 
		
	end loop;

	process_section_buffer(sec_buffer); 							-- process the final section
	
	print("histo_exon_count: ",[str(j) + ":" + x: x = histo_exon_count(j) | x /= OM]);		-- print the exon size histogram
	print("Done phase 1: ",sec_count," ",time()); 					-- note end of first processing phase

	close(ihandle); close(ohandle_1); close(ohandle_2); close(ohandle_n); close(badhandle);		-- close all output files

				-- now reread the ohandle_n file and sort it by the biggest_internal_buff,overall_length, number of exons prefix
				-- attached by the process_section_buffer routine
	lines := get_lines(file_name + ".n");
	line_pairs := [[lines(j),lines(j + 1)]: j in [1,3..nl :=#lines]];			-- group as pairs

 	ohandle_n := open(file_name + ".n","TEXT-OUT");					-- reopen the '.n' output file (Rnas with at least 3 introns)
	ohandle_summary := open(file_name + ".smry","TEXT-OUT");		-- open the summary output file
 	
 	line_triple_tup := []; 					-- will collect line triples, as described below
 	
 	for [hl,nextl] = line_pairs(jj) loop								-- iterate over the line pairs

		data_line_pieces := breakup(nextl,"\t");					-- break the data line into its tab-separated pieces

		if (strand := data_line_pieces(DNA_strand)) = "-" then 				-- if introns are on the negative strand, reverse order of exon list
			n_exons := #(exons := breakup(data_line_pieces(19),","));
			exons := join([exons(j): j in [n_exons,n_exons - 1..1]],",");
		elseif (exons := data_line_pieces(19)) /= OM then 
			rmatch(exons,",");		-- otherwise just drop final comma from exon list
		else
			continue;				-- since section is bad
		end if; 

					-- get starting positions of exons on genome, which will be subtracted 
					-- to get length of genome pieces containing exons 
		intron_plus_exon_lengths_tup := [unstr(x): x in (starts_on_genome := breakup(data_line_pieces(21),",")) | x /= ""];
		exon_tup := [x: x in breakup(data_line_pieces(19),",")];		-- get lengths of exons

					-- calculate intron lengths as differences of successive starts minus exon lengths
		nelt := #(intron_lengths_tup := [intron_plus_exon_lengths_tup(j + 1) - intron_plus_exon_lengths_tup(j) - unstr(exon_tup(j)): 
																	j in [1..ngaps := #intron_plus_exon_lengths_tup - 1]]);
		
		if strand  = "-" then 
							-- reverse list of intron lengths if on negative strand
			intron_lengths_tup := [intron_lengths_tup(j): j in [nelt,nelt - 1..1]]; 
		end if;
									
		intron_lengths_stg := join([str(x): x in intron_lengths_tup],",");		-- convert to a comma-separated string

		hl_pieces := segregate(hl," /gb="); 				-- extract the genbank name
		genbank_name := if exists piece = hl_pieces(pn) | piece = " /gb=" then  hl_pieces(pn + 1)?"????" else "????" end if;
		
			-- assemble the output summary line: biggest_internal_buff,overall_length, number of exons tag, then exon list, 
			-- then strand indication, then Rna genbank name, the intron lengths
 --if hl = OM then print("hl = OM: ",jj); end if;
 		output_summary_line := tags(hl) + "\t" + exons + "\t" + strand + "\t" + genbank_name + "\t" + intron_lengths_stg;

			-- add the three information items just calculated, with a calculated prefix to force lexicographic sorting, to the developing
			-- line triple list
 		line_triple_tup with:= 	[prefix(hl),hl,nextl,output_summary_line];

	end loop;
 	
 	line_triple_tup := merge_sort(line_triple_tup); 		-- sort the line triples
 		
				-- now loop over the sorted line triples and print out the header and tuple data in sorted order	
	for [-,hl,nextl,output_summary_line] in line_triple_tup loop
		printa(ohandle_n,hl); printa(ohandle_n,nextl);				-- rewrite the '.n' output file, now in sorted order
		printa(ohandle_summary,output_summary_line); 				-- write the summary file
	end loop;
	
	print("Done phase 2: ",nl/2," ",time()); 			-- emit phase 2 complete message

end filter_psl;

procedure tags(tagged_header_line);		-- get prepended prefix of tagged header line
	-- Returns the four sort-tag fields (tab-separated fields 2..5) rejoined with tabs.
	fields := breakup(tagged_header_line,"\t");
	return join(fields(2..5),"\t"); 
end tags;

procedure prefix(tagged_header_line);		-- get prepended prefix of tagged header line and convert to integer for sorting
		-- Packs tab-separated fields 2..5 of the tagged header line into a single integer so that
		-- an integer sort reproduces the intended field-by-field order.
		-- NOTE(review): field 3 has weight 10**6 but field 4 has weight 10**4, so field 4 must stay
		-- below 100 and field 5 below 10**4 to avoid collisions -- TODO confirm the field ranges.

	pieces := breakup(tagged_header_line,"\t"); 
	return unstr(pieces(2)) * 100000000000 + unstr(pieces(3)) * 1000000 + unstr(pieces(4)) * 10000 + unstr(pieces(5));

end prefix;

procedure process_section_buffer(sec_buffer); 			-- find the longest match among those reported			
			-- finds best one of the data lines reported, eliminates false introns from this line (if any),
			-- prepends sorting tag to the corresponding header line, and writes the pair of lines
			-- to one of the files .bad, .1, .2, or .n
			-- NOTE(review): writes to the global handles ohandle_1/ohandle_2/ohandle_n/badhandle and
			-- updates the global histo_exon_count; DNA_strand is a global column index -- confirm.
			
	head_lin := o_head_lin := sec_buffer(1); rspan(head_lin," \t");  		-- get the header line and clean it up
	full_len := rbreak(head_lin,"="); full_len := unstr(full_len);  		-- get the full length of the mrna
--	print(sec_buffer(1));
	best_line := OM;			-- best data line, to be found
	
	best_len := 0 max/[match_len(line): line in sec_buffer(2..)];
						   		--find the maximum match length of any of the data lines in the section
	must := exists best_line in sec_buffer(2..) | match_len(best_line) = best_len;
					-- and the data line having this maximum match length
--if best_len = OM then print("sec_buffer: ",sec_buffer);	end if;
	tag_showing_missing := " [" + (missing := full_len - best_len) + "]";		-- prepare a tag showing the missing length

	if best_line /= OM then 				-- if the section contains at least one data line
		best_pieces := breakup(best_line,"\t"); 		-- break it into its tab-delimited pieces

					-- get starting positions of exons on genome, which will be subtracted 
					-- to get length of genome pieces containing exons 
		intron_plus_exon_lengths_tup := [unstr(x): x in (starts_on_genome := breakup(best_pieces(21),",")) | x /= ""];
		exon_tup := [x: x in breakup(best_pieces(19),",")];		-- get lengths of exons

					-- calculate intron lengths as differences of successive starts minus exon lengths
		nelt := #(intron_lengths_tup := [intron_plus_exon_lengths_tup(j + 1) - intron_plus_exon_lengths_tup(j) - unstr(exon_tup(j)): 
																	j in [1..ngaps := #intron_plus_exon_lengths_tup - 1]]);

					-- where the intron lengths are small, add the surrounding exon lengths and the intron lengths
					--  together to get a new series of exon lengths and a reduced series of intron lengths
		new_exon_tup := [unstr(exon_tup(1))]; new_starts_on_genome := [starts_on_genome(1)];

		for x = intron_lengths_tup(j) loop 
			if x < 10 then 			-- combine following intron with last stacked
								-- add the lengths of the exons being joined, plus that of the intervening short intron
				lnet := #new_exon_tup;
				new_exon_tup(lnet) := new_exon_tup(lnet) + x + unstr(exon_tup(j + 1));
			else		-- take the following exon and its start
				new_exon_tup with:= unstr(exon_tup(j + 1)); 
				new_starts_on_genome with:= starts_on_genome(j + 1);
			end if;	
		end loop;			-- end of false intron elimination
		
		best_pieces(18) := str(exon_count := #new_exon_tup);			-- revise the number of introns
		best_pieces(19) := join([str(x): x in new_exon_tup],",");		-- revise the list of exon lengths
		best_pieces(21) := join(new_starts_on_genome,",");				-- revise the list of starts
		best_line := join(best_pieces,"\t");							-- reassemble the corrected data line
	
		if exon_count?0 > 2 then
				 			-- prepend sorting information to lines with at least 3 introns: biggest middle intron len, biggest middle intron posn, n-introns,full-len,
			biggest_exon_len := max/ (nextup_middle := new_exon_tup(2..nnet := #new_exon_tup - 1)); 
			must := exists exl = nextup_middle(bestexon_pos) | exl = biggest_exon_len;		-- bind bestexon_pos to the winner's position
			strand := best_pieces(DNA_strand); if strand notin "+-" then abort("strand trouble: " + strand); end if;
			if strand = "-" then bestexon_pos := nnet - bestexon_pos; end if;		-- count position from the other end on the negative strand
			o_head_lin := o_head_lin(1) + "\t" + biggest_exon_len + "\t" + (bestexon_pos + 1) + "\t" + exon_count
																	 + "\t" + best_len + "\t" + o_head_lin(2..);
		end if;
		
		histo_exon_count(exon_count) := (histo_exon_count(exon_count)?0) + 1;		-- histogram number of exons
		
--print("intron_len_list: ",intron_len_list); if sec_count > 10 then stop; end if;
	end if;
	
	
	use_handle := if best_line = OM then badhandle elseif exon_count = 1 then ohandle_1 		-- determine the target file to be written
							elseif exon_count = 2 then ohandle_2 else ohandle_n end if;		-- if missing > 100 then badhandle else

	printa(use_handle,o_head_lin + tag_showing_missing);		-- write the modified header line
	if best_line /= OM then printa(use_handle,best_line); end if;	-- write the data line if any
	
end process_section_buffer;

procedure match_len(line); 		-- match length of a data line: its first tab/blank-delimited token, as an integer
	span(line,"\t ");				-- strip any leading tabs and blanks
	len_token := break(line,"\t ");	-- take the token up to the next tab or blank
	return unstr(len_token);
end match_len;

procedure peek(file_name,strt,n);					-- peek at start of specified file	 			
		-- Prints n lines of the named file, beginning at line strt (1-based).
		-- NOTE(review): the handle is never closed and nothing is returned -- debugging aid only.
	ihandle := open(file_name,"TEXT-IN");
	for j in [1..strt - 1] loop
		geta(ihandle,x); 		-- skip the lines before the window
	end loop;
	for j in [1..n] loop
		geta(ihandle,x);  print(x);
	end loop;
end peek;

procedure index_ncbi_active_list(file_name);	-- index an NCBI active list, producing and writing an index flat
	-- we index the NCBI active list by hashing the GB names of its elements down to a 20 bit hash 
	-- (thereby producing an index with about 1 million 6-byte entries: a 4-byte end position plus a 
	-- 2-byte group length -- see flat_create(6 * prime) below), and binsorting the entries using
	-- this same hash. The index gives the end-byte of the range of values with a given hash.
	const prime := 1048573;		-- largest prime less than 2**20; must agree with the modulus used by ncbi_hash
	
						-- open two sets of 8 files to be used in binsorting
	junk :=  [open(file_name + ".jk." + str(j),"TEXT-OUT"): j in [1..7]];		-- create and clear the 7 auxiliary files
	for h in junk loop close(h); end loop;
	
					-- set up vectors of infile handles, outfile handles, and corresponding file names
	inhandles := [open(file_name,"TEXT-IN")] + [open(file_name + ".jk." + str(j),"TEXT-IN"): j in [1..7]];
	inames := [file_name] + [file_name + ".jk." + str(j): j in [1..7]];			-- names of the input files

	ohandles := [open(ix_name := file_name + ".ix","TEXT-OUT")] + [open(ix_name + "." + str(j),"TEXT-OUT"): j in [1..7]];
					-- this uses the index-file-to-be temporarily as one of the binsorting files
	onames := [ix_name] + [ix_name + "." + str(j): j in [1..7]];			-- names of the output files
	
					-- perform the binsorting operation
	for j in [0,3..21] loop 		-- process 3-bit nibbles in order of increasing significance (but order doesn't really matter)
		[inhandles,inames,ohandles,onames] := merge_by_3_bit_groups(inhandles,inames,ohandles,onames,2 ** j); 
		print("Done mergepass ",j," ",time()); 
	end loop;
					-- since this involved an even number (8) of bin merges the sorted file is back in its original position
					-- all the other files are now empty and should be removed
	
	for j in [2..8] loop close(inhandles(j)); end loop;		-- close the files that are no longer needed
	for j in [1..8] loop close(ohandles(j)); end loop;
	
	index_flat := flat_create(6 * prime);			-- create the index flat (initially zeroes); 6 bytes per hash value
	handle := inhandles(1);							-- get the read handle of the surviving (sorted) file
	geta(handle,x);	ox := x;						-- read its first line (ox is a scratch copy, since break destroys its argument)
	front_x := break(ox,","); prior_hash := ncbi_hash(front_x); 					-- hash of the GB name preceding the first comma
print("first entry is: ",x," ",#x," prior_hash: ",prior_hash," ",6 * prior_hash + 1);
	characters_passed := 0;					-- running count of characters passed
	length_prior_group := 0;				-- running count of characters passed in group just starting
debug_count := 0; 
	
	while x /= OM loop 
	
		nox := #x; front_x := break(x,",");		-- save the full line length before break consumes x
		if (this_hash := ncbi_hash(front_x)) /= prior_hash then 		-- current element starts a new group
			flat_set_slice(index_flat,6 * prior_hash + 1,flat_from_setl(end_and_len := int_to_4(characters_passed) + int_to_2(length_prior_group)));
--if (debug_count +:= 1) < 10 then print("index entry for group ended by ",front_x," written to: ",6 * prior_hash + 1," ",hexify(end_and_len)," ",characters_passed," ",length_prior_group); end if;
			prior_hash := this_hash; 			-- update the running quantities
			length_prior_group := 0;			-- initialize for the next group
						-- write the index entry in the 6-byte field indicated by the hash; 
						-- it contains the position in the sorted file of the last character of the group 
						-- of elements with a common hash, plus the byte_length of this group
		end if;
		
		characters_passed +:= (nch := nox + 1);	length_prior_group +:= nch;		-- running count of characters passed; allow for eol
		geta(handle,x);	

	end loop;			
		
	flat_set_slice(index_flat,6 * prior_hash + 1,flat_from_setl(int_to_4(characters_passed) + int_to_2(length_prior_group)));
								 -- write the final index entry

	close(open(ix_name,"TEXT-OUT"));			-- make sure that the index file is empty
	flat_file_put(ix_name,1,index_flat);		-- write out the index flat 
	
	print("Done indexing ",file_name," ",time()); 
	
end index_ncbi_active_list;

procedure nibbles(int);		-- decompose an integer into its seven 3-bit digits, most significant first
	digits := [];
	for shift in [18,15..0] loop
		digits with:= ((int / 2**shift) mod 8);		-- extract the 3-bit field at this shift
	end loop;
	return digits;
end nibbles;

procedure merge_by_3_bit_groups(inhandles,inames,ohandles,onames,divis); 				-- performs a single bin redistribution step
	-- distributes every element of the 8 input bins into one of the 8 output bins, keyed by the 
	-- 3-bit field of the element's ncbi_hash selected by 'divis' (a power of 2), then reopens the 
	-- files with their roles reversed and returns [new inhandles, new inames, new ohandles, new onames].
--print("merge_by_3_bit_groups: ",time());	
debug_count := 0;
num_in_bin_1 := 0;
	for ih in inhandles loop			-- process the input bins in order

		geta(ih,x);				-- distribute the elements of this bin to the output bins, using the 
								-- 3-bit hash field selected by divis
		while (ox := x) /= OM loop 		-- ox is a scratch copy, since break destroys its argument
if ((debug_count +:= 1) mod 1000000) = 0 then print(debug_count," ",time()); end if;
			front := break(ox,",");		-- the GB name is everything preceding the first comma
			distrib_key := (((hf := ncbi_hash(front)) / divis) mod 8) + 1;			-- calculate the target bin; 8 gives lower 3 bits
--if distrib_key = 1 then if (num_in_bin_1 +:= 1) < 10 then print(divis," ",distrib_key," hf: ",hf," ",hf/divis," ",front," ",nibbles(ncbi_hash(front))); end if; end if;
			printa(ohandles(distrib_key),x);
			geta(ih,x);				-- and on to the next element 
		end loop;			
		
	end loop;
	
	for ih in inhandles loop close(ih); end loop;		-- close the files preparatory to reopening them with the opposite character
	for ih in ohandles loop close(ih); end loop;		-- close the files preparatory to reopening them with the opposite character
	ohandles := [open(fname,"TEXT-IN"): fname in onames];			-- reopen the files with read/write roles reversed
	inhandles := [open(fname,"TEXT-OUT"): fname in inames];
	
	return [ohandles,onames,inhandles,inames];			-- the former input files become outputs and vice versa
	
end merge_by_3_bit_groups;

procedure gb_record(gb_record_name);			-- fetch a genbank record by its name
	-- uses the index flat built by index_ncbi_active_list to locate the group of active-list entries 
	-- sharing gb_record_name's hash, searches that group for the name itself, and fetches the record 
	-- by its gi number. Returns OM if the name is not present. 
	-- Globals: gb_active_ix_flat (cached index flat), gb_active_file_handle (cached file handle).
	
--	use_file := "Macintosh HD:gbacclist.test";					-- open the sorted gb activelist index file
	use_file := "Macintosh HD:GbAccList.0912.2004";					-- open the sorted gb activelist index file
	
	if gb_active_ix_flat = OM then 			-- read in the flat if it has not already been read
		the_size := fsize(handle := open(ix_name := use_file + ".ix","RANDOM")); close(handle);
		gb_active_ix_flat := flat_file_get(ix_name,1,the_size);
	end if;

	if gb_active_file_handle = OM then 			-- open the sorted index file if it has not already been opened
		gb_active_file_handle := open(use_file,"RANDOM");
	end if;

	end_and_len := flat_to_setl(flat_slice(gb_active_ix_flat,st := 6 * ncbi_hash(gb_record_name) + 1,st + 5));
						-- the 6-byte index entry: end position and byte length of the group of entries with this hash

	if (rec_end := i4_to_i(end_and_len(1..4))) = 0 then return OM; end if;		-- an all-zero entry means the hash group is empty
	rec_len := i2_to_i(end_and_len(5..6));		-- byte length of the hash group

	gets(gb_active_file_handle,rec_end - rec_len + 1,rec_len,group_of_entries);		-- read the whole hash group
	bg := breakup(group_of_entries,",\n\r"); 		-- NOTE(review): presumably fields cycle name,...,gi -- confirm active-list layout
	if not (exists piece = bg(j) | piece = gb_record_name) then return OM; end if;	-- name absent: only a hash collision matched
--print("web fetch ",bg(j + 2));	
	return get_by_gi(bg(j + 2));			-- use the internal accession (gi) number to fetch the record
				-- (j remains bound by the exists above; the gi is two fields past the matching name)

end gb_record;

procedure ncbi_hash(stg); 	-- hash an ncbi GB name
	
	const prime := 1048573;		-- largest prime less than 2**20
			-- the hash of a name is its value, read as a base-256 integer, reduced mod this prime
	
	h := 0;
	
	for ch in stg loop
		h := (h * 256 + abs(ch)) mod prime;		-- abs(ch) yields the character code
	end loop;
	
	return h;
	
end ncbi_hash;

procedure cut_by_enzymes(flat_stg,enz_name_list); 	-- return the list of pieces into which a string of bases 
													-- is cut by a specified list of restriction enzymes, 
													-- applied in the specified order, return triples [enzA,substg,enzB]
	
	if is_string(flat_stg) then flat_stg := flat_from_setl(flat_stg); end if;	-- allow SETL string argument
	
	trips := [["Begin",flat_stg,"End"]];		-- start a list of triples
				-- (was 'stg', an undefined name: the comprehension below binds its own local 'stg',
				-- so the initial triple must carry the flat form of the input string)
	
				-- allow a single enzyme to be supplied as a simple string
	if is_string(enz_name_list) then enz_name_list := [enz_name_list]; end if;

	for enz in enz_name_list loop
		
		cut_template := restriction_enzymes_data(enz);		-- look up the enzyme's recognition/cut template
		trips := +/[cut_by_an_enzyme(tag1,stg,tag2,enz,cut_template): [tag1,stg,tag2] in trips]; 
					-- each piece is re-cut by the current enzyme; the pieces inherit/gain tags
	
	end loop;
	
	return [[tagA,flat_to_setl(fs),tagB]: [tagA,fs,tagB] in trips];			-- convert flats to SETL
	
end cut_by_enzymes;

procedure cut_by_an_enzyme(tag1,flat_stg,tag2,enz_name,cut_template);  -- cut by a single enzyme

	-- This returns a tuple of tagged pieces [tagA,piece,tagB], where in internal positions 
	-- enz_name is used as the tag, and in the end positions tag1 and tag2 are used.
	-- NOTE(review): as written the routine never returns a value (it falls off the end, yielding OM),
	-- although its caller cut_by_enzymes applies +/ to the results; it appears to be unfinished.
					-- expand all the abbreviated characters in the cut template 
 	cut_template := expand_template_abbreviations(cut_template);

	if not (exists c = cut_template(j) | c = "^") then 		-- no cut mark: something wrong, but go ahead anyhow
		cut_pos := ^cut_template/2;		-- NOTE(review): '^cut_template' looks like a typo for '#cut_template' (half the template length) -- confirm
	else
		cut_pos := j;	-- so after suppression of the cut marks the cut position is before the j-th character of the match template
	end if;
	
	nct := #(cut_template := suppress_chars(cut_template,"^")); 
			-- for now, ignore the possibility of more than 1 cut mark
 				
 	cut_tup := [if x(1) = "N" then #x else x end if: x in segregate(cut_template,"N")];
 				-- to accelerate matching, represent substrings of 'N' by their integer length
				-- and then build an extraction template to be used in the search which follows
				-- the extraction template codes are:
				-- 		0: skip field (absent in output)
				--		10: no conversion: return raw string
	template_string_pieces := [];		-- NOTE(review): never populated before being passed to find_match_sites -- confirm intent

	flat_cut_template := 		-- flat_cut_template is set up for use by flat_slices_to_setl; see the codes above 
		flat_from_setl("" +/ [if is_integer(x) then char(x) + "\x00" else char(#x) + "\x0a" end if: x in cut_tup]);
	
	match_sites := find_match_sites(flat_stg,flat_cut_template,template_string_pieces,nct);
				-- returns the list of all starting points of matches to the template

end cut_by_an_enzyme;

procedure find_match_sites(flat_stg,flat_cut_template,template_string_pieces,nct);
				-- returns the list of all starting points of matches to the template
	-- flat_stg: flat string of bases to be searched
	-- flat_cut_template: extraction template for flat_slices_to_setl (built in cut_by_an_enzyme)
	-- template_string_pieces: the literal template pieces which each extracted piece must match
	-- nct: overall length of the match template
	
	match_starts := [];			-- will build and return a list of match starts
	
	for j in [0..flat_len(flat_stg) - nct] loop
 
 		pieces_list := flat_slices_to_setl(flat_stg,j,flat_cut_template);		-- extract the pieces relevant to the match
					-- (was 'flat_string', an undefined name; 'flat_stg' was clearly intended)
		
		piece_was_matched := true;		-- assume success; also covers the case of an empty pieces_list
		
		for piece = pieces_list(pix) loop		-- 'pix', not 'j': the original re-used 'j', clobbering the outer position index
			
			tempiece := template_string_pieces(pix);		-- get the corresponding template piece
			
			piece_was_matched := true;		-- assume that the following match loop will succeed
	
			for c = piece(k) | c /= (tc := tempiece(k)) loop 		-- examine only the mismatching positions
				
				case tc

		-- 3.2. Purine (adenine or guanine): R
		-- 3.3. Pyrimidine (thymine or cytosine): Y
		-- 3.4. Adenine or thymine: W
		-- 3.5. Guanine or cytosine: S
		-- 3.6. Adenine or cytosine: M
		-- 3.7. Guanine or thymine: K
		-- 3.8. Adenine or thymine or cytosine: H
		-- 3.9. Guanine or cytosine or thymine: B
		-- 3.10. Guanine or adenine or cytosine: V
		-- 3.11. Guanine or adenine or thymine: D
		-- 3.12. Guanine or adenine or thymine or cytosine: N
				
					when "A","C","T","G" => piece_was_matched := false; exit;	-- literal base mismatched
					when "R" => if c /= "A" and c /= "G" then piece_was_matched := false; exit; end if;
					when "Y" => if c /= "T" and c /= "C" then piece_was_matched := false; exit; end if;
					when "W" => if c /= "A" and c /= "T" then piece_was_matched := false; exit; end if;
					when "S" => if c /= "C" and c /= "G" then piece_was_matched := false; exit; end if;
					when "M" => if c /= "C" and c /= "A" then piece_was_matched := false; exit; end if;
					when "K" => if c /= "T" and c /= "G" then piece_was_matched := false; exit; end if;
					when "H" => if c = "G" then piece_was_matched := false; exit; end if;
					when "B" => if c = "A" then piece_was_matched := false; exit; end if;
					when "V" => if c = "T" then piece_was_matched := false; exit; end if;
					when "D" => if c = "C" then piece_was_matched := false; exit; end if;

				end case;
					
			end loop;
		
			if not piece_was_matched then exit; end if;		-- if piece match failed, then entire match fails; go on to next j
	
		end loop;
	
		if piece_was_matched then match_starts with:= j; end if;		
					-- when match is successful, note the starting position of the match
		
	end loop;
	
	return match_starts;
	
end find_match_sites;

procedure expand_template_abbreviations(cut_template);	-- look for integers, possibly preceded by a character or 
														-- parenthesized group to be replicated, 
														-- e.g. "GN3C" -> "GNNNC", "G(AT)2C" -> "GATATC"

	if #(pieces := segregate(cut_template,"0123456789")) = 1 then return cut_template; end if;		-- nothing to expand
	
	expanded := [];

	for piece in pieces | piece /= "" loop
		
		if piece(1) notin "0123456789" then 

			expanded with:= piece; 		-- an ordinary (non-numeric) run; just collect it

		else							-- we have an integer: it replicates the tail of the preceding run

			pval := unstr(piece);		-- get its numerical value
			
			last := #expanded;					-- index of the run to be expanded; the original code indexed by 
												-- position within 'pieces' (j - 1), which diverges from the length 
												-- of 'expanded' as soon as a second number appears
			prior_piece := expanded(last);

			if prior_piece(npp := #prior_piece) = ")" and (exists k in [npp,npp - 1..1] | prior_piece(k) = "(") then 
													-- expand everything back to the opening paren.
				expanded(last) := prior_piece(1..k - 1) + pval * prior_piece(k + 1..npp - 1);

			else		-- just expand the last character

				expanded(last) := prior_piece(1..npp - 1) + pval * prior_piece(npp);

			end if;

		end if;

	end loop; 		
	
	return "" +/ expanded;
	 
end expand_template_abbreviations;

procedure survey_ncbi_active_list(file_name);	-- survey GB-identifier digit lengths in an NCBI active list,
												-- copying (at most the first 4M lines of) the list to a '.o' file

	ihandle := open(file_name,"TEXT-IN");
	print(ohandle := open(file_name + ".o","TEXT-OUT"));		-- echoes the handle as a crude open-failure check
	geta(ihandle,x); num_read := 0; nshort := num_NP_read := 0;
	if x /= OM then printa(ohandle,x); end if;		-- copy the first line, guarding against an empty file
	min_digs := 100000;			-- minimum number of trailing digits seen in any gb identifier
	
	while x /= OM loop
		
		gb := break(x,","); digs := rspan(gb,"0123456789"); min_digs min:= (nd := #digs);	-- trailing digits of the GB name
		if nd = 5 then nshort +:= 1; end if;		-- count the number of short (5-digit) identifiers
		if (num_read +:= 1) mod 1000000 = 0 then print(num_read," ",min_digs," ",nshort," ",time()); end if;	-- progress report
		if num_read > 4000000 then exit; end if;			-- for getting a small initial slice for testing
		geta(ihandle,x); 
		if x /= OM then printa(ohandle,x); end if;		-- guard: the original wrote a spurious line at end of file
	end loop;
	
	close(ihandle); close(ohandle);		-- release both files (formerly leaked)
	
	print(num_read," ", time());
	
end survey_ncbi_active_list;

procedure get_section(file_name,sec_num);		-- get specified section of pre-indexed file
		-- NOTE(review): this routine is an unfinished stub -- it currently returns OM without 
		-- reading anything; the commented-out line below hints at the intended logic.
		
		-- read the index flat if it has not been read already
--	handle := open(file_name,"RANDOM"); 
	
end get_section;
	
procedure merge_psl_summaries(file_name_list,out_name);			-- merge a list of psl summary files 			
	-- file_name_list is a comma-delimited list of 'file_name#species_tag' items; the corresponding 
	-- '.psl.smry' files are merged in increasing order of sumry_prefix_val into out_name + ".smerge", 
	-- each output line being prefixed by its species tag.
	
	ohandle := open(out_name + ".smerge","TEXT-OUT"); 		-- open the merged output file
	 
	file_name_list := breakup(breakup(file_name_list,","),"#");		-- now a tuple of [file_name,species_tag] pairs
	print("Starting merge: ",file_name_list," ",time());
	file_list := [get_lines(file_name + ".psl.smry"): [file_name,-] in file_name_list];		-- read every file whole
	file_len_list := [#t: t in file_list];
	ix_list := #file_list * [1];		-- per-file cursor, initially 1 for each file

	while (smallest := min/ [sumry_prefix_val(sumry_line): ix = ix_list(j) | (sumry_line := file_list(j)(ix)) /= OM]) /= OM loop
				-- smallest sort key among the lines currently under the cursors; OM once every file is exhausted
		
		if not (exists ixj = ix_list(j) | ((sumry_line := file_list(j)(ixj)) /= OM and sumry_prefix_val(sumry_line) = smallest)) then
			abort("something wrong");		-- the minimum must be attained by some file
		end if;
		 
		printa(ohandle,file_name_list(j)(2) + "\t" +  sumry_line);			-- output the smallest summary line with attached species identifier
						-- (j remains bound by the exists above to the file holding the smallest line)
		ix_list(j) +:= 1;													-- advance that file's cursor 
if (x := (0 +/ix_list)) mod 10000 = 0 then print(x); end if;		
	end loop;
	
	print("Done merge: ",time());
	
end merge_psl_summaries;
	
	procedure actual_data(rec_triple); 	-- read a record, given its [start,length,handle] locator triple
		-- returns the raw record text, or a diagnostic string when no triple was supplied
		if rec_triple = OM then return "Bad triple: " + str(rec_triple); end if;
		[start_pos,byte_count,the_handle] := rec_triple;		-- unpack the record locator
		gets(the_handle,start_pos,byte_count,the_data);			-- random-access read of the record body
		return the_data;
	end actual_data;
	
	procedure find_genbank_id(rec_start,rec_len,file_handle); 	-- read a genbank id from a record's header line
		-- reads the record at [rec_start,rec_len] via file_handle and extracts the identifier 
		-- following the "/gb=" marker in its first line; returns OM (with a diagnostic) if no such field.
if show_insertion_debug = true then print("rec_start: ",rec_start," rec_len: ",rec_len," sum: ",rec_start + rec_len - 1," filesize: ",fsize(file_handle)); end if;
		gets(file_handle,rec_start,rec_len - 10,rec);		-- get the record referenced (all but last 10 characters, which will surely be base data)
		head_line := break(rec,"\n"); 					-- break out its head line
		pieces := segregate(head_line," /gb=");			-- segregate by the character set { ,/,g,b,=}

		if not (exists piece = pieces(j) | (piece = " /gb=" or ((np := #piece) >= 4 and piece(np - 3..) = "/gb="))) then 
			print("********** Illformed section header line: ",head_line," ",pieces); return OM;
		end if;
		
		return pieces(j + 1);			-- the identifier is the piece immediately following the marker
	end find_genbank_id;

	procedure build_unigene_index(list_of_unigene_filenames,saved_index_name);
					-- build indices for the specified unigene files. This routine operates on the raw unigene files,
					-- which must be in the folder located by the 'to_raw_unigene_prefix' path.
					-- list_of_unigene_filenames is a comma-delimited string of file names; the resulting
					-- index object is written to the external file named by saved_index_name.
					-- statistics on number of objects bumped at levels shown (for mouse index): [17153, 1828, 52]
	
		name_tup := breakup(list_of_unigene_filenames,",");		-- split the common string into individual names
							-- invoke the indexing routine proper, which returns a rix (record-index) object
		the_index := tag_and_index_ug_files([to_raw_unigene_prefix + nm: nm in name_tup]); 
	
		save_and_close(the_index,saved_index_name);		-- write the rix to its external file and close it
	
	end build_unigene_index;
	
	procedure tag_and_index_ug_files(file_name_tup); 	-- prepare a unigene file index also showing the starting position of the cds 
														-- this routine can prepare a single index for multiple files
														-- Note that this routine returns an index object, leaving it 
														-- to the calling routine (see build_unigene_index) to save the index object
 
 			-- iterate over the file name tuple, getting an estimate of the total index size which will be required
 			-- for the overall index. estimate about 800 bases per gene, which is a bit under average

		est_num_genes := 0;			-- will be totaled
		
		for file_name in file_name_tup loop
			est_num_genes +:= fsize(ihandle := open(file_name,"RANDOM"))/800;		-- open the input file just to measure it
			close(ihandle);			-- release the input file for later use
		end loop;

		the_rix := new_rix(est_num_genes,find_genbank_id);	-- create a new index object for records 
print("Created index for storing ",est_num_genes,"genes  ",time());	
		
		debug_count := 0;		-- for tracking progress and counting total number of genes indexed

		for file_name = file_name_tup(jj) loop
			
print("indexing file: ",file_name," ",time());	

			close(open(revised_file := file_name + ".withoffs","TEXT-OUT"));		-- clear the revised file which will be written
			ohandle := open(revised_file,"RANDOM");		-- open the output file which will be used to store the revised file version
														-- Note that it is the revised file version (the '.withoffs') that is indexed.
			
			section := OM; section_indices := [];		-- will collect list of section indices
			
			ihandle := open(file_name,"TEXT-IN");		-- open the input file to be read
			geta(ihandle,line); 						-- read its first line
	
			sec_end := 0;		-- initialize position of last character of sections thus far written to the revised file
			
			while line /= OM loop			-- read all the lines of the input file
			
				if (sec_starting := #line > 0 and line(1) = ">") then			-- a new section is starting
--print("new section: ",line);
					if section /= OM then			-- analyze the prior section
						head_line := section(1); section_data := "" +/ section(2..);		-- condense the data lines to one string
						sat := cds_start_and_translation(section_data);		-- translate to best protein and get start of translation
						head_line +:= (" [" + str(sat(1)) + "]");			-- attach the mRNA position of the translation start to the head_line 
										-- Note that it is this added data that forces us to rewrite our initial input files, creating 
										-- their 'revised' versions.
								
						section := head_line + "\n" + section_data + "\n";  -- the new section consists of the revised head-line plus 
																			-- the following data, which is condensed to a single long string
						sec_start := sec_end + 1;							-- new section starts 1 character past the end of the old
						
						sec_end +:= (sec_len := (#head_line + #section_data + 2));		
							-- get the total section length, including line-end characters and calculate
							-- the section end in the new file to be written (note that each new section consists of just two lines)
	 					
							-- write this modified section to the file .withoffs and put the corresponding index entry into the rix.
						puts(ohandle,sec_start,section);
--if jj > 1 then show_insertion_debug := true; end if;	 
						insert_record(the_rix,sec_start,sec_len,revised_file,ohandle);		-- put the index entry into the rix
	 								-- supply ohandle as last parameter since the '.withoffs' file is being built concurrently with indexing.
--print("gbid: ",find_genbank_id(sec_start,sec_len,ohandle)," ",section);
	 
						if (debug_count +:= 1) mod 1000 = 0 then print(debug_count," ",time()); end if;		-- track progress
 
 					end if;
					
					section := [line];			-- start the next section with the header line which terminated the old
	
	 			else				-- the previous section continues
--print("collect: ",line);	  		-- collect the next line into the developing section
				
	 				section with:= line;
				
	 			end if;
				
				geta(ihandle,line);				-- read in one more line, and loop back to process it.
--print("read: ",line);

			end loop;
--print("semifinal section data: ",sec_start," ",sec_len); show_insertion_debug := true;
							-- capture the final section if any
			if section /= OM then			-- analyze the final section
	
				head_line := section(1); section_data := "" +/ section(2..);
				sat := cds_start_and_translation(section_data);		-- translate to best protein and get start of translation
				head_line +:= (" [" + str(sat(1)) + "]");			-- attach the offset to the head_line 
				
				section := head_line + "\n" + section_data + "\n\n\n"; 		-- two padding end-line characters are written,
																			-- deliberately excluded from sec_len below
				sec_start := sec_end + 1;							-- new section starts 1 character past the end of the old
				
				sec_end +:= (sec_len := (#head_line + #section_data + 2));		
					-- get the total section length, including line-end characters and calculate
					-- the section end in the new file to be written (note that each new section consists of just two lines)
					
					-- write this modified section to the file .withoffs and put the corresponding index entry into the rix.
				puts(ohandle,sec_start,section);	 
	
				insert_record(the_rix,sec_start,sec_len,revised_file,ohandle);		-- put the final index entry for this file into the rix
								-- supply ohandle as last parameter since the new file is being built concurrently with indexing

			end if;
			
			print("Done Indexing file: ",file_name," ",time());
			close(ihandle); close(ohandle); 			-- close both of the files being used
			end_external_use(the_rix,revised_file);		-- notify the rix_object that a file that was being used externally has been closed

		end loop;			-- end of iteration over files
			
		print("Done all Indexing: ",time()," est_num_genes: ",est_num_genes," actual number: ",debug_count," bumped at levels shown: ",debug_numbumped);
		
		return the_rix;
	
	end tag_and_index_ug_files;

procedure sumry_prefix_val(sumry_line);			-- sorting key of a psl summary line
	-- packs the first four tab-delimited numeric fields into one integer, most significant
	-- field first, so that plain integer comparison orders the summary lines
	fields := breakup(sumry_line,"\t"); 
	return unstr(fields(1)) * 100000000000 + unstr(fields(2)) * 1000000 
			+ unstr(fields(3)) * 10000 + unstr(fields(4));

end sumry_prefix_val;

procedure print_group(group_of_indices);			-- print a multi-line group for inspection
	-- writes each of the group's summary lines, together with its record annotation, to the 
	-- global ohandle. Globals used: lines (summary lines), ohandle (report file), rix_obj (record index).
	tag := if #{lines(ix)(1): ix in group_of_indices} > 1 then "***" else "" end if;
				-- "***" flags a group whose lines begin with more than one distinct (species) character

	printa(ohandle,"\t\t\t\t",time()); printa(ohandle,tag + " Group of ",#group_of_indices," mRNAs"); 
--print("Group of ",#group_of_indices," mRNAs");

	for ix in group_of_indices loop 

		line := lines(ix);
		annotation := header_line(actual_data(get_record(rix_obj,genbank_id(line))));		-- header line of the indexed record
		 
		if annotation = OM then annotation := "NO ANNOTATION [0]"; end if;

		rspan(annotation," \t"); annotation_of_start := rbreak(annotation," \t"); 		-- detach the trailing "[start]" field
		nprinta(ohandle,show_moduli(line + "\t" + annotation_of_start),"\t"); 
 
 		printa(ohandle,annotation);
		
	end loop;

end print_group;

procedure print_protein_group(group_of_indices);			-- print a group's protein translations, aligning pairs
	-- when the group is a pair with base data available for both members, a 3-mer based protein 
	-- alignment is attempted and the aligned strings printed; otherwise each member's best translation 
	-- is printed as-is. Globals used: lines, ohandle, rix_obj, alignment_count.
	tag := if #{lines(ix)(1): ix in group_of_indices} > 1 then "***" else "" end if;
				-- "***" flags a group whose lines begin with more than one distinct (species) character

--print("Group of ",#group_of_indices," mRNAs");
	group_of_indices := [x: x in group_of_indices]; 		-- force to tuple
	bases := [data_line(actual_data(get_record(rix_obj,genbank_id(lines(ix)))))?"NO BASES": ix in group_of_indices];
	
	if #[x: x in bases | x /= "NO BASES"] = (ngi := #group_of_indices) and ngi = 2 then		-- if the group is a pair, do alignment
if (alignment_count +:= 1) mod 10 = 0 then print("aligned pairs: ",alignment_count," ",time()); end if;
		if (aligned_stgs := align_by_mers(cds_start_and_translation(bases(1))(2),cds_start_and_translation(bases(2))(2),3)) = OM then 
			printa(ohandle,"\t\t\t\t",time()); printa(ohandle,tag + "\nproteins for group of ",#group_of_indices," mRNAs: FAIL TO ALIGN");

			for base_stg in bases loop 		-- alignment failed; print the unaligned translations
		  		printa(ohandle,trans := if base_stg = "NO BASES" then "NO BASES" else cds_start_and_translation(base_stg)(2) end if);
			end loop;

			return;		-- done with this case
 
 		end if;

		printa(ohandle,"\t\t\t\t",time()); printa(ohandle,tag + "\nproteins for group of ",#group_of_indices," mRNAs: ALIGNED");

		for base_stg in aligned_stgs loop rany(base_stg,"\n\r");  printa(ohandle,base_stg); end loop; 		-- strip one trailing eol character before printing

		return;		-- done with this case
 
 	end if;
				-- fall-through case: not an alignable pair; just print the translations
	printa(ohandle,"\t\t\t\t",time()); printa(ohandle,tag + "\nproteins for group of ",#group_of_indices," mRNAs"); 
	for base_stg in bases loop 
		
  		printa(ohandle,trans := if base_stg = "NO BASES" then "NO BASES" else cds_start_and_translation(base_stg)(2) end if);
		
	end loop;

end print_protein_group;

procedure header_line(data);			-- get header_line portion of data record
	-- returns the text preceding the first newline, or OM if data is not a string
	if not is_string(data) then return OM; end if;
	return break(data,"\n");			-- break peels off (and yields) everything up to the first "\n"
end header_line;

procedure data_line(data);			-- get data_line portion of data record
	-- drops the header line (up to and including the first newline) and returns the rest 
	-- of the record with lower-case letters forced to upper case; OM for a non-string
	if not is_string(data) then return OM; end if;
	dropped_header := break(data,"\n"); 		-- peel off and discard the header text
	match(data,"\n");							-- remove the newline itself as well
	return case_change(data,"lu");				-- "lu": lower to upper
end data_line;

procedure genbank_id(line);			-- get genbank id from .smerge line
	-- the genbank identifier is the eighth tab-delimited field of the summary line
	fields := breakup(line,"\t");
	return fields(8);
end genbank_id;

procedure remove_lines(group_of_indices);			-- remove a group of lines from the lines_with_trips map
	-- for each line in the group having at least 5 exons, deletes that line's index from the 
	-- lines_with_trips image of every interior exon triple it contains, so that already-reported 
	-- lines are not found again. Globals used: lines, lines_with_trips.

	for line_ix in group_of_indices | (nel := #(exons_in_line :=  breakup(breakup(lines(line_ix),"\t")(6),","))) >= 5 loop
						-- iterate over all the lines in the group containing at least 5 exons
		 
		 trips_seen_in_this_line := {};			-- guard against deleting the same triple twice for one line
		 
		for j in [2..nel - 3] | (trip := join(exons_in_line(j..j + 2),","))  notin trips_seen_in_this_line loop
			trips_seen_in_this_line with:= trip;		-- note that already processed
			lines_with_trips(trip) less:= line_ix;		-- drop this line from the triple's occurrence set
		end loop;

	end loop;
	
end remove_lines;
 
procedure triples_in_several(input_file);			-- look for cross-species triples; for use with remotely related species 			
	-- maps each interior exon triple to the set of line numbers in which it occurs, then reports 
	-- every triple occurring in lines of more than one species (as judged by the lines' first characters).

	print("Starting triples_in_several analysis: ",time());

	output_file := input_file; rbreak(output_file,"."); output_file +:= "sevrl";		-- replace the file extension by 'sevrl'
	ohandle := open(output_file,"TEXT-OUT");
	lines := get_lines(input_file);
	
	species := {};			-- will map each triple into the set of line numbers in which it occurs

	for line = lines(lno) | (nex := #(exons := breakup(breakup(line,"\t")(6),","))) > 4 loop

		for j in [2..nex - 3] loop		-- interior triples only
			
			trip := join(exons(j..j + 2),",");
			species(trip) := (species(trip)?{}) with lno;
		end loop;

	end loop;

	lines_for_trips_in_mult := {[trip,lines_in]: [trip,lines_in] in species | #{lines(line_ix)(1): line_ix in lines_in} > 1};
				-- keep only the triples whose lines begin with more than one distinct (species) character
	
--	printa(ohandle,trips_in_multi := );
	
	for [trip,lines_in] in lines_for_trips_in_mult loop 
		print("\n",trip); for line in lines_in loop print(lines(line)); end loop;
	end loop;

	print("Done triples_in_several analysis: ",time());

end triples_in_several;

procedure exon_length_compare_histogram(input_file1,input_file2);	-- histogram of exon lengths
	-- builds interior-exon length histograms for two input files and reports the selected lengths 
	-- in each, and in the entrywise maximum of the two histograms.
	-- NOTE(review): the filters below select y < 3 (rare lengths) while the captions say 
	-- "Most frequent" / "frequency more than 100" -- this looks like a leftover test edit; confirm the intended threshold.

	print("Starting comparative exon_length histograming: ",input_file1," and ",input_file2," ",time());

	len_histo1 := histo_lengths(input_file1); len_histo2 := histo_lengths(input_file2);
	print("number of interior triples in len_histo1: " + (+/ range(len_histo1))," number distinct triples: ",#domain(len_histo1));
	print("number of interior triples in len_histo2: " + (+/ range(len_histo2))," number distinct triples: ",#domain(len_histo2));
	
	print("\nMost frequent in histo 1: ",join(big_ones := [v + ":" + (-u): [u,v] in merge_sort([[-y,x]: [x,y] in len_histo1 | y < 3])],"  ")); 
	print("Number with frequency more than 100: ",#big_ones); print(); print();
	print("Most frequent in histo 2: ",join(big_ones := [v + ":" + (-u): [u,v] in merge_sort([[-y,x]: [x,y] in len_histo2 |  y < 3])],"  "));
	print("Number with frequency more than 100: ",#big_ones); print(); print();

	max_histo := {[x,y max (len_histo2(x)?0)]: [x,y] in len_histo1};		-- entrywise maximum of the two histograms
	print("Most frequent in max_of_both: ",join(big_ones := [v + ":" + (-u): [u,v] in merge_sort([[-y,x]: [x,y] in max_histo |  y < 3])],"  "));
	print("Number with frequency more than 100: ",#big_ones); 
	
	print("Done exon_length histograming: ",time());

end exon_length_compare_histogram;

procedure histo_lengths(input_file);			-- build histogram of interior exon lengths		
	-- reads the named summary file and returns a map from exon length (as it appears in the 
	-- comma-separated length list of field 19) to its number of occurrences among 'interior' 
	-- exons, i.e. positions [2..nel - 3] of lines having more than 4 exons. 
	-- Header (">") lines and empty lines are skipped.
 
 	lines := get_lines(input_file);
	
	len_histo := {};
 
 	for line = lines(j) | line /= "" and line(1) /= ">" 
			and (nel := #(exon_list := breakup(breakup(line,"\t")(19),","))) > 4 loop 
		
		for k in [2..nel - 3] loop		-- 'k', not 'j': the original re-used 'j', clobbering the outer loop index
			the_len := exon_list(k);
			len_histo(the_len) := (len_histo(the_len)?0) + 1;
		end loop;
		
	end loop;

	return len_histo;
	
end histo_lengths;

procedure triple_occurences(input_file,rix_file);			-- look for multiply occuring triples and assemble/align groups			
	
	print("Starting triples analysis: ",input_file," ",time());

 	rix_obj := open_rix(rix_file,find_genbank_id);							-- this needs to be generalized; presently woks only for human and mouse
		
	output_file := input_file; rbreak(output_file,"."); output_file +:= "thist";
	ohandle := open(output_file,"TEXT-OUT"); 
	print("Number of input lines: ",#(lines := get_lines(input_file)));
--if exists line = lines(ix) | breakup(line,"\t")(6) = OM then print("OM: ",line," ",ix); end if;
	lines_with_ge6_exons := [ix: line = lines(ix) | #breakup(breakup(line,"\t")(6),",") > 5];
	lines_with_ge6_count := {}; 
	for ix in lines_with_ge6_exons loop
		tag := lines(ix)(1);
		lines_with_ge6_count(tag) := (lines_with_ge6_count(tag)?0) + 1;
	end loop;

	print("Lines with more than 5 exons: ",lines_with_ge6_count);
	
	lines_with_trips := lines_with_runs([1..#lines],3);			-- develop map of all triples into indices of their lines of occurence
					-- get the triples in the order of their frequency of occurence

	found_a_group := true;				-- set flag for while-loop which follows
	tot_numgroups_found := 0; tot_lines_in_groups_found := 0;
	
	while found_a_group loop 			-- keep iterating as long as groups of size at least 3 are being found
		
		found_a_group := false;			-- drop flag; will be raised if a group is found
		
 		frequent_triples_by_frequency := merge_sort([[-#lines_in,trip]: [trip,lines_in] in lines_with_trips]);
 					-- sort (or re-sort) the triples by the number of lines in which they appear
 
--print("frequent_triples_by_frequency: ",frequent_triples_by_frequency(1..50)," ",time()); 

		numgroups_found := 0;
		
		for [neg_freq,trip] in frequent_triples_by_frequency loop			-- loop over triples in order of decreasing frequency
	
			if (freq := -neg_freq) < 3 then exit; end if;
				
							-- form a sub_histogram of the lines containing trip
			lines_with_runs_in_this_set := lines_with_runs(lines_with_trips(trip),3);		-- note that all these lines must contain trip
			frequent_trips_this_set_by_frequency := 
					merge_sort([[-#lines_in,otrip]: [otrip,lines_in] in lines_with_runs_in_this_set | otrip/= trip]);
			
			if (nbg := #(biggest_group := lines_with_runs_in_this_set(most_frequent := (frequent_trips_this_set_by_frequency(1)?[0,{}])(2))?{})) < 3 then 
				continue; 			-- ignore 2-element groups which will be handled later
			end if;
		
			tot_lines_in_groups_found +:= nbg;
			numgroups_found +:= 1;
			print_group(biggest_group); 			-- report the lines in the group found
			print_protein_group(biggest_group);		
		
							-- remove all the summary lines in the group just reported from the lines_with_trips map
			remove_lines(biggest_group);		-- and now iterate, finding new groups as long as possible
	
		end loop;
		
		print("numgroups_found: ",numgroups_found," ",time()); tot_numgroups_found +:= numgroups_found;
		if numgroups_found > 0 then found_a_group := true; end if;
		
	end loop;

	print("End search for groups with more than 2 RNAs: ",time()," ",tot_numgroups_found," were found, containing ",tot_lines_in_groups_found," mRNAs");
			
			-- now we repeat the search, this time for pairs
	
	found_a_group := true;				-- set flag for while-loop which follows
	tot_numgroups_found := 0;
	
	while found_a_group loop 			-- keep iterating as long as groups of size at least 3 are being found
		
		found_a_group := false;			-- drop flag; will be raised if a group is found
		
 		frequent_triples_by_frequency := merge_sort([[-#lines_in,trip]: [trip,lines_in] in lines_with_trips]);
	 					-- sort (or re-sort) the triples by the number of lines in which they appear
	
		numgroups_found := 0;
		
		for [neg_freq,trip] in frequent_triples_by_frequency loop			-- loop over triples in order of decreasing frequency
	
			if (freq := -neg_freq) < 2 then exit; end if;
				
							-- form a sub_histogram of the lines containing trip
			lines_with_runs_in_this_set := lines_with_runs(lines_with_trips(trip),3);		-- note that all these lines must contain trip
			frequent_trips_this_set_by_frequency := 
					merge_sort([[-#lines_in,otrip]: [otrip,lines_in] in lines_with_runs_in_this_set | otrip/= trip]);
			
			if #(biggest_group := lines_with_runs_in_this_set(most_frequent := (frequent_trips_this_set_by_frequency(1)?[0,{}])(2))?{}) < 2 then 
				continue; 			-- ignore 2-element groups which will be handled later
			end if;
		
			numgroups_found +:= 1;
			print_group(biggest_group); 			-- report the lines in the group found
			print_protein_group(biggest_group);		

							-- remove all the summary lines in the group just reported from the lines_with_trips map
			remove_lines(biggest_group);		-- and now iterate, finding new groups as long as possible
	
		end loop;
		
		print("num pairs found: ",numgroups_found," ",time()); tot_numgroups_found +:= numgroups_found;
		if numgroups_found > 0 then found_a_group := true; end if;
		
	end loop;
					-- now repeat the search, but dropping the requirement that single triples be confirmend by  
					-- second triples in the all the lines of a group or pair 
	tot_numgroups_found := 0; tot_lines_in_groups_found := 0;			-- estt the counts
	
	print("Start search for unconfirmed groups and pairs: ",time());
	
	printa(ohandle,"\n*********** Unconfirmed groups and pairs ***************\n");	
	found_a_group := true;				-- set flag for while-loop which follows
		
	while found_a_group loop 			-- keep iterating as long as groups of size at least 3 are being found
		
		found_a_group := false;			-- drop flag; will be raised if a group is found
		
 		frequent_triples_by_frequency := merge_sort([[-#lines_in,trip]: [trip,lines_in] in lines_with_trips]);
 					-- sort (or re-sort) the triples by the number of lines in which they appear

		numgroups_found := 0;
		
		for [neg_freq,trip] in frequent_triples_by_frequency loop			-- loop over triples in order of decreasing frequency
	
			if (freq := -neg_freq) < 3 then exit; end if;
						
			tot_lines_in_groups_found +:= #(biggest_group := lines_with_trips(trip));
			numgroups_found +:= 1;
			print_group(biggest_group); 			-- report the lines in the group found
			print_protein_group(biggest_group);		
			
							-- remove all the summary lines in the group just reported from the lines_with_trips map
			remove_lines(biggest_group);		-- and now iterate, finding new groups as long as possible
	
		end loop;
		
		print("numgroups_found: ",numgroups_found," ",time()); tot_numgroups_found +:= numgroups_found;
		if numgroups_found > 0 then found_a_group := true; end if;
		
	end loop;

	print("End search for unconfirmed groups with more than 2 RNAs: ",time()," ",tot_numgroups_found," were found, containing ",tot_lines_in_groups_found," mRNAs");
			
			-- now we repeat the unconfirmed search, this time for pairs
	
	found_a_group := true;				-- set flag for while-loop which follows
	tot_numgroups_found := 0;
	
	while found_a_group loop 
		
		found_a_group := false;			-- drop flag; will be raised if a group is found
		
 		frequent_triples_by_frequency := merge_sort([[-#lines_in,trip]: [trip,lines_in] in lines_with_trips]);
	 					-- sort (or re-sort) the triples by the number of lines in which they appear
	
		numgroups_found := 0;
		
		for [neg_freq,trip] in frequent_triples_by_frequency loop			-- loop over triples in order of decreasing frequency
	
			if (freq := -neg_freq) < 2 then exit; end if;
		
			numgroups_found +:= 1;
			print_group(biggest_group := lines_with_trips(trip)); 			-- report the lines in the group found
			print_protein_group(biggest_group);		
			
							-- remove all the summary lines in the group just reported from the lines_with_trips map
			remove_lines(biggest_group);		-- and now iterate, finding new groups as long as possible
	
		end loop;
		
		print("num pairs found: ",numgroups_found," ",time()); tot_numgroups_found +:= numgroups_found;
		if numgroups_found > 0 then found_a_group := true; end if;
		
	end loop;
						
	print("End search for pairs of RNAs: ",time()," ",tot_numgroups_found," were found");
	
	unassigned_lines := [] +/ [[u: u in y]: [-,y] in lines_with_trips];
	
	unassigned_count := {}; 
	for ix in unassigned_lines loop
		tag := lines(ix)(1);
		unassigned_count(tag) := (unassigned_count(tag)?0) + 1;
	end loop ;
	
	print("Unassigned lines: ",unassigned_count);
	
	print("Done triples analysis: ",time());

end triple_occurences;

	procedure show_moduli(thist_line);			-- transform a .thist line to show exon moduli and start of reading frame	

return thist_line;			-- disable till precise exon boundaries are available

		-- NOTE(review): everything below is currently dead code because of the
		-- deliberate early return above; retained for when exon boundaries are known.

		rspan(thist_line,"] \t"); tail := rbreak(thist_line,"["); rspan(thist_line,"[ \t");	-- peel the trailing "[n]" annotation
		start_of_coding := unstr(tail);			-- numeric start of the coding region
		pieces := breakup(thist_line,"\t");		-- the tab-separated fields of the .thist line
--print("pieces: ",pieces);		
		exlens := [unstr(x): x in breakup(pieces(6),",")];		-- exon lengths parsed from field 6
		exlens(1) := exlens(1) - start_of_coding; 	-- first exon shortened by the non-coding prefix
		
		mod_tags := [current_modtag := 0];		-- reading-frame offset (mod 3) at the start of each exon
		
		for j in [2..#exlens] loop 
			mod_tags with:= (new_tag := ((current_modtag + exlens(j - 1)) mod 3)); 	-- frame offset accumulates prior exon lengths mod 3
			current_modtag := new_tag;
		end loop;
		
		-- rewrite field 6 as "start+len.mod,len.mod,...", then reassemble the line
		annotated_exon_lengths := str(start_of_coding) + "+" + join([str(el) + "." + str(mod_tags(j)): el = exlens(j)],",");
		pieces(6) := annotated_exon_lengths;
			 	
		return join(pieces,"\t");
				
	end show_moduli;
	
procedure hv_sum(hv);		-- total of the counts in a [tag,count] histogram; also records hv in global hv_debug
	hv_debug := hv;			-- keep a copy of the argument for debugging
	total := 0;
	for [-,count] in hv loop total +:= count; end loop;		-- empty histogram yields 0
	return total;
end hv_sum;
	
	procedure data_blocks_in(lines);			-- extract the data blocks from a Genbank record
		
		-- Returns a tuple of blocks, each block being a tuple of consecutive
		-- data-like lines (as judged by only_allowed_chars).  OM input => [].
		if lines = OM then return []; end if;
		
		-- keep only data-like lines, pairing each with its original index j;
		-- nl is the number of lines kept
		nl := #(lines := [[j,line]: line = lines(j) | only_allowed_chars(line)]);
		-- jumps marks positions (in the filtered tuple) where the original line
		-- numbering is discontinuous, i.e. the start of each new block
		jumps := [k: [j,-] = lines(k) | k = 1 or lines(k - 1)(1) /= j - 1];
		jumps with:= (nl + 1);			-- sentinel so the final block is closed
	
		return [[p(2): p in lines(jumps(j)..jumps(j + 1) - 1)]: j in [1..#jumps - 1]];
		
	end data_blocks_in; 
	
	
	procedure only_allowed_chars(line);		-- test a line to see if it looks data-like
	
		-- Returns true iff the line is wholly consumed by the allowed character
		-- classes (after stripping trailing whitespace/quotes and an optional
		-- GenBank "/translation" qualifier) and its leading run exceeds 4 chars.
		rspan(line," \t\"");							-- strip trailing blanks, tabs, quotes
		front := span(line,allowed_prefix_chars); 		-- leading run of allowed characters
		-- BUG FIX: qualifier was misspelled "/traslation", so genuine GenBank
		-- "/translation" lines never had their '= "' prefix stripped and so
		-- failed the line = "" test below
		maytr := match(line,"/translation"); if maytr /= "" then span(line," \t\"="); end if;
		midA := span(line,allowed_pept_block_charsA);	-- peptide-block character runs
		midB := span(line,allowed_pept_block_charsB);
		back := span(line,allowed_prefix_chars);
	
		return line = "" and #front > 4; 				-- fully consumed, with a substantial leading run
		
	end only_allowed_chars; 
		
	procedure int_to_4(n);			-- pack an integer into 4 bytes, most significant byte first
		bytes := "";				-- low-order bytes are prepended, so the result ends up big-endian
		for i in [1..4] loop
			bytes := char(n mod 256) + bytes;
			n /:= 256;
		end loop;
		return bytes;
	end int_to_4;

	procedure int_to_2(n);			-- pack an integer into 2 bytes, most significant byte first			
		low_byte := char(n mod 256);		-- least significant byte, emitted second
		n /:= 256;			
		return char(n mod 256) + low_byte;			
	end int_to_2;

	procedure i4_to_i(stg);			-- unpack 4 big-endian bytes into an integer
		val := 0;
		for j in [1..4] loop val := val * 256 + abs(stg(j)); end loop;		-- Horner accumulation, byte 1 most significant
		return val; 			
	end i4_to_i;
	
	procedure i2_to_i(stg);			-- unpack 2 big-endian bytes into an integer			
		high := abs(stg(1)); low := abs(stg(2));		-- byte 1 is the most significant
		return high * 256 + low; 			
	end i2_to_i;
	
end genetic_code_pak;

program test;				-- calls to unigene analysis procedures; see comments in preceding package on files appearing below		
	use string_utility_pak,get_lines_pak,sort_pak,rix_flat_pak,genetic_code_pak; 		-- use rix_flat_pak to get the gene annotations	or data			

	-- **************************************************************************************
	-- ************************************* test calls *************************************
  	-- **************************************************************************************

---> program

--	peek("Macintosh HD:GbAccList.0912.2004",10000,500);					-- peek at start of specified file 

				-- ********* routines related to the NCBI active list *********
--	survey_ncbi_active_list("Macintosh HD:GbAccList.0912.2004");
--	i ndex_ncbi_active_list("Macintosh HD:GbAccList.0912.2004");	-- index an NCBI active list, producing and writing an index flat
--	i ndex_ncbi_active_list("gbacclist.test");	-- index an NCBI active list, producing and writing an index flat
--	i ndex_ncbi_active_list("Macintosh HD:GbAccList.test");			-- index the main NCBI active list, producing and writing its index flat
--handle := open("Macintosh HD:gbacclist.test","TEXT-IN"); for j in [1..100] loop geta(handle,x); front := break(x,","); print(front + x," ",ncbi_hash(front)); end loop;

--	i ndex_ncbi_active_list("GbAccList.test");			-- index the main NCBI active list, producing and writing its index flat
--handle := open("gbacclist.test","TEXT-IN"); for j in [1..100] loop geta(handle,x); front := break(x,","); print(front + x," ",ncbi_hash(front)); end loop; stop; 

--	print(get_by_gi(52024523));						-- get a genbank record by its genbank gi number
--	print(get_by_gi(52024142));
--	print(get_by_gi(3358)); stop;
--	print("fetching; ","X52700"," ",time()); print(join(data_blocks_in(gb_record("X52700"))(1),"\n"));
--	print("fetching; ","CAA44904"," ",time()); print(join(data_blocks_in(gb_record("CAA44904"))(1)?["BAD"],"\n"));
--	print("fetching; ","CAA35955"," ",time()); print(join(data_blocks_in(gb_record("CAA35955"))(1)?["BAD"],"\n"));
--	print("fetching; ","X53413"," ",time()); print(join(data_blocks_in(gb_record("X53413"))(1)?["BAD"],"\n"));
--	print("fetching; ","A52422"," ",time()); print(join(data_blocks_in(gb_record("A52422"))(1)?["BAD"],"\n"));
--	print("fetching; ","I46638"," ",time()); print(join(data_blocks_in(gb_record("I46638"))(1)?["BAD"],"\n"));

--	get_sodium_channel_data("sodium_channels_subset");

--	examine_unigene_indices("LcXlHsMmDr.rix");
-- 	protein_alignment_test();			-- test the protein alignment function
--	pairwise_align_potassium_family("potassium_channels_subfamily_k");		-- pairwise alignment of a file of header/data line pairs
 	
  	-- **************************************************************************************
	-- ********** calls to build the collection of files used for Unigene analysis **********
  	-- **************************************************************************************

--	b uild_unigene_index("Sma.seq.uniq,Bmo.seq.uniq","LcXl.rix");
--	b uild_unigene_index("Mm.seq.uniq,Hs.seq.uniq,Dr.seq.uniq,Xl.seq.uniq,Lco.seq.uniq","LcXlHsMmDr.rix");
				-- index the indicated list of unigene files, saving the resulting index in the indicated '.rix' file

--	f ilter_psl(to_genome_lengths_prefix + "zebrafish.psl"); 		-- filter one of Toto's .psl output files 
--	f ilter_psl(to_genome_lengths_prefix + "celegans.psl");			-- filter one of Toto's .psl output files 
--	f ilter_psl(to_genome_lengths_prefix + "Chimp.psl");			-- filter one of Toto's .psl output files 
--	f ilter_psl(to_genome_lengths_prefix + "Drosophila.psl");		-- filter one of Toto's .psl output files 
--	f ilter_psl(to_genome_lengths_prefix + "human.psl");			-- filter one of Toto's .psl output files 

									-- merge a list of psl summary files 
--	m erge_psl_summaries(to_genome_lengths_prefix + "Human#H," + to_genome_lengths_prefix + "zebrafish#Z",to_genome_lengths_prefix + "Hozeb"); stop;
--	m erge_psl_summaries(to_genome_lengths_prefix + "Human#H," + to_genome_lengths_prefix + "Drosophila#D",to_genome_lengths_prefix + "Hodro");

--	t riple_occurences(to_genome_lengths_prefix + "homus.smerge","LcXlHsMmDr.rix");		-- prepare 'homus.thist' file, with proteins
--	t riple_occurences(to_genome_lengths_prefix + "hozeb.smerge","LcXlHsMmDr.rix");		-- prepare 'hozeb.thist' file, with proteins
--	triples_in_several(to_genome_lengths_prefix + "Hoeleg.smerge");	-- look for cross-species triples; for use with remotely related species
--	exon_length_compare_histogram(to_genome_lengths_prefix + "Celegans.psl.n",
--				to_genome_lengths_prefix + "Human.psl.n");		-- compare histogram of exon lengths

	 
	lines := prepare_test_group("test_group"); tags := [line(1): line in lines]; 	-- tags = leading tag character of each combined line
	lines := [line(3..): line in lines]; group_strings_by_alignment(lines,tags,3);	-- strip the "c." prefix, then group by alignment

procedure get_sodium_channel_data(file_name);
	-- Fetch the GenBank record for every id listed in file_name (second
	-- blank-separated field of each line) and echo the first data block of each
	-- record both to the console and to a scratch file "junk".

	ids := [breakup(line," ")(2): line in get_lines(file_name)];		-- accession ids: field 2 of each line
	ohandle := open ("junk","TEXT-OUT");		-- NOTE(review): handle is never closed explicitly -- presumably closed at program exit
	for id = ids(j) loop
		-- ?["BAD"] guards against records yielding no data blocks
		print(j," fetching; ",id," ",time()," ",join(data_blocks_in(gb_record(id))(1)?["BAD"],"\n"));
		printa(ohandle,j," fetching; ",id," ",time()," ",join(data_blocks_in(gb_record(id))(1)?["BAD"],"\n"));
	end loop;

	print("Done");
	
end get_sodium_channel_data;

procedure protein_alignment_test;						
	
	-- Exercise align_by_mers on a collection of hard-coded protein sequences of
	-- varying relatedness.  The constants below are amino-acid strings named by
	-- their GenBank/RefSeq accessions; only the two potassium-channel calls at
	-- the bottom are currently active.

	-- prot1/prot2: a closely related pair
	prot1 := "MSAGSATHPAAGGRRSKWDQPAPAPLLFLPPTAPGGEVAGSGASPGGATTAAPSGALDAAAAVAAKINAMLMAKGKLKPSQNAAE" + 
				"KLQAPGKSLTSNKSKDDLVVAEVEINDVPLTCRNLLTRGQTQDEISRLSGAAVSTRGRFMTTEEKAKVGPGDRPLYLHVQGQTREL" + 
				"VDRAVNRIKEIITNGVVKAATGTSPTFNGATVTVYHQPAPIAQLSPAINQKPSFQSGMHYVQDKLFVGLEHAVPTFNVKEKVEGPG" + 
				"CSYLQHIQIETGAKVFLRGKGSGCIEPASGREAFEPMYIYISHPKPEGLAAAKKLCENLLQTVHAEYSRFVNQINTAVPLPGYTQP" + 
				"SAISSIPPQPPYYPSNGYQSGYPVVPPPQQPVQPPYGVPSIVPPAVSLAPGVLPALPTGVPPVPTQYPITQVQPPASTGQSPISAPF" + 
				"IPAAPVKTALPTGPQPQPQLPAQPQSQKRRFTEELPDERDSGLLGYQVQASPVRMR";
	prot2 := "MSAGSATHPGAGGRRSKWDQPAPAPLLFLPPAAPGGEVTSSGGSPGGTTAAPSGALDAAAAVAAKINAMLMAKGKLKPTQNASEKLQ" + 
				"APGKGLTSNKSKDDLVVAEVEINDVPLTCRNLLTRGQTQDEISRLSGAAVSTRGRFMTTEEKAKVGPGDRPLYLHVQGQTRELVDRA" + 
				"VNRIKEIITNGVVKAATGTSPTFNGATVTVYHQPAPIAQLSPAVSQKPPFQSGMHYVQDKLFVGLEHAVPTFNVKEKVEGPGCSY" + 
				"LQHIQIETGAKVFLRGKGSGCIEPASGREAFEPMYIYISHPKPEGLAAAKKLCENLLQTVHAEYSRFVNQINTAVPLPGYTQPSA" + 
				"ISSVPPQPPYYPSNGYQSGYPVVPPPQQPVQPPYGVPSIVPPAVSLAPGVLPALPTGVPPVPTQYPITQVQPPASTGQSPMGGPFI" + 
				"PAAPVKTALPAGPQPQPQPQPPLPSQPQAQKRRFTEELPDERESGLLGYQHGPIHMTNLGTGFSSQNEIEGAGSKPASSSGKERE" + 
				"RDRQLMPPPAFPVTGIKTESDERNGSGTLTGSHGECDIAGGTGEWLRLV";

	-- an "easy alignment" pair (G-protein alpha subunits)
	prot_NM_131868 := "MGAGASAEEKHSRELEKKLKEDADKDARTVKLLLLGAGESGKSTIVKQMKIIHKDGYSLEECLEFI" + 
					"VIIYSNTMQSILAVVRAMTTLNIGYGDAAAQDDARKLMHLADTIEEGTMPKELSDIILRLWKDSGIQ" + 
					"ACFDRASEYQLNDSAGYYLNDLERLIQPGYVPTEQDVLRSRVKTTGIIETQFSFKDLNFRMFDVGGQR" + 
					"SERKKWIHCFEGVTCIIFIAALSAYDMVLVEDDEVNRMHESLHLFNSICNHRYFATTSIVLFLNKKDV" + 
					"FVEKIKKAHLSMCFPEYDGPNTFEDAGNYIKVQFLDLNLRRDIKEIYSHMTCATDTENVKFVFDAVTD" + 
					"IIIKENLKDCGLF";


	prot_NM_006496 := "MGCTLSAEDKAAVERSKMIDRNLREDGEKAAKEVKLLLLGAGESGKSTIVKQMKIIHEDGYSEDECKQYK" + 
					"VVVYSNTIQSIIAIIRAMGRLKIDFGEAARADDARQLFVLAGSAEEGVMTPELAGVIKRLWRDGGVQA" + 
					"CFSRSREYQLNDSASYYLNDLDRISQSNYIPTQQDVLRTRVKTTGIVETHFTFKDLYFKMFDVGGQRS" + 
					"ERKKWIHCFEGVTAIIFCVALSDYDLVLAEDEEMNRMHESMKLFDSICNNKWFTETSIILFLNKKDLF" + 
					"EEKIKRSPLTICYPEYTGSNTYEEAAAYIQCQFEDLNRRKDTKEIYTHFTCATDTKNVQFVFDAVTDV" + 
					"IIKNNLKECGLY";

	-- a "harder alignment" pair (more distantly related receptors)
	prot_BC038795 := "MHLKLKGNLALSQAFSKDSFPKLRILEVPYAYQCCPYGMCASFFKASGQWEAEDLHLDDEESSKRPLGLLAR" + 
					"QAENHYDQDLDELQLEMEDSKPHPSVQCSPTPGPFKPCEYLFESWGIRLAVWAIVLLSVLCNGLVLLTV" + 
					"FAGGPAPLPPVKFVVGAIAGANTLTGISCGLLASVDALTFGQFSEYGARWETGLGCRATGFLAVLGSEA" + 
					"SVLLLTLAAVQCSVSVSCVRAYGKSPSLGSVRAGVLGCLALAGLAAALPLASVGEYGASPLCLPYAPPE" + 
					"GQPAALGFTVALVMMNSFCFLVVAGAYIKLYCDLPRGDFEAVWDCAMVRHVAWLIFADGLLYCPVAFLS" + 
					"FASMLGLFPVTPEAVKSVLLVVLPLPACLNPLLYLLFNPHFRDDLRRLRPRAGDSGPLAYAAAGELEKS" + 
					"SCDSTQALVAFSDVDLILEASEAGRPPGLETYGFPSVTLISCQQPGAPRLEGSHCVEPEGNHFGNPQPSM" + 
					"DGELLLRAEGSTPAGGGLSGGGGFQPSGLAFASHV";

	prot_AF055585 := "MRGVGWQMLSLSLGLVLAILNKVAPQACPAQCSCSGSTVDCHGLALRSVPRNIPRNTERLDLNGNNITRITK" + 
					"TDFAGLRHLRVLQLMENKISTIERGAFQDLKELERLRLNRNHLQLFPELLFLGTAKLYRLDLSENQIQA" + 
					"IPRKAFRGAVDIKNLQLDYNQISCIEDGAFRALRDLEVLTLNNNNITRLSVASFNHMPKLRTFRLHSNN" + 
					"LYCDCHLAWLSDWLRQRPRVGLYTQCMGPSHLRGHNVAEVQKREFVCSGHQSFMAPSCSVLHCPAACTC" + 
					"SNNIVDCRGKGLTEIPTNLPETITEIRLEQNTIKVIPPGAFSPYKKLRRIDLSNNQISELAPDAFQGLR" + 
					"SLNSLVLYGNKITELPKSLFEGLFSLQLLLLNANKINCLRVDAFQDLHNLNLLSLYDNKLQTIAKGTFS" + 
					"PLRAIQTMHLAQNPFICDCHLKWLADYLHTNPIETSGARCTSPRRLANKRIGQIKSKKFRCSGTEDYRS" + 
					"KLSGDCFADLACPEKCRCEGTTVDCSNQKLNKIPEHIPQYTAELRLNNNEFTVLEATGIFKKLPQLRKI" + 
					"NFSNNKITDIEEGAFEGASGVNEILLTSNRLENVQHKMFKGLEKPQNLMLRSNRITCVGNDSFIGLSSV" + 
					"RMLSLYDNQITTVAPGAFDTLHSLSTLNLLANPFNCNCYLAWLGEWLRKKRIVTGNPRCQKPYFLKEIP" + 
					"IQDVAIQDFTCDDGNDDNSCSPLSRCPTECTCLDTVVRCSNKGLKVLPKGIPRDVTELYLDGNQFTLVP" + 
					"KELSNYKHLTLIDLSNNRISTLSNQSFSNMTQLLTLILSYNRLRCIPPRTFDGLKSLRLLSLHGNDISV" + 
					"VPEGAFNDLSALSHLAIGANPLYCDCNMQWLSDWVKSEYKEPGIARCAGPGEMADKLLLTTPSKKFTCQ" + 
					"GPVDVNILAKCNPCLSNPCKNDGTCNSDPVDFYRCTCPYGFKGQDCDVPIHACISNPCKHGGTCHLKEG" + 
					"EEDGFWCICADGFEGENCEVNVDDCEDNDCENNSTCVDGINNYTCLCPPEYTGELCEEKLDFCAQDLNP" + 
					"CQHDSKCILTPKGFKCDCTPGYVGEHCDIDFDDCQDNKCKNGAHCTDAVNGYTCICPEGYSGLFCEFSP" + 
					"PMVLPRTSPCDNFDCQNGAQCIVRINEPICQCLPGYQGEKCEKLVSVNFINKESYLQIPSAKVRPQTNI" + 
					"TLQIATDEDSGILLYKGDKDHIAVELYRGRVRASYDTGSHPASAIYSVETINDGNFHIVELLALDQSLS" + 
					"LSVDGGNPKIITNLSKQSTLNFDSPLYVGGMPGKSNVASLRQAPGQNGTSFHGCIRNLYINSELQDFQK" + 
					"VPMQTGILPGCEPCHKKVCAHGTCQPSSQAGFTCECQEGWMGPLCDQRTNDPCLGNKCVHGTCLPINAF" + 
					"SYSCKCLEGHGGVLCDEEEDLFNPCQAIKCKHGKCRLSGLGQPYCECSSGYTGDSCDREISCRGERIRD" + 
					"YYQKQQGYAACQTTKKVSRLECRGGCAGGQCCGPLRSKRRKYSFECTDGSSFVDEVEKVVKCGCTRCVS";

	prot_NM_144499 := "MGAGASAEEKHSRELEKKLKEDAEKDARTVKLLLLGAGESGKSTIVKQMKIIHQDGYSLEECLEFIAII" + 
					"YGNTLQSILAIVRAMTTLNIQYGDSARQDDARKLMHMADTIEEGTMPKEMSDIIQRLWKDSGIQACF" + 
					"ERASEYQLNDSAGYYLSDLERLVTPGYVPTEQDVLRSRVKTTGIIETQFSFKDLNFRMFDVGGQRSE" + 
					"RKKWIHCFEGVTCIIFIAALSAYDMVLVEDDEVNRMHESLHLFNSICNHRYFATTSIVLFLNKKDVFF" + 
					"EKIKKAHLSICFPDYDGPNTYEDAGNYIKVQFLELNMRRDVKEIYSHMTCATDTQNVKFVFDAVTDIII" + 
					"KENLKDCGLF";

	-- same as prot_NM_144499 but with "xx" inserted near the start (alignment stress test)
	prot_NM_144499x := "MGAGASAEEKHSRELEKKLKEDAEKDARTVKxxLLLLGAGESGKSTIVKQMKIIHQDGYSLEECLEFIAII" + 
					"YGNTLQSILAIVRAMTTLNIQYGDSARQDDARKLMHMADTIEEGTMPKEMSDIIQRLWKDSGIQACF" + 
					"ERASEYQLNDSAGYYLSDLERLVTPGYVPTEQDVLRSRVKTTGIIETQFSFKDLNFRMFDVGGQRSE" + 
					"RKKWIHCFEGVTCIIFIAALSAYDMVLVEDDEVNRMHESLHLFNSICNHRYFATTSIVLFLNKKDVFF" + 
					"EKIKKAHLSICFPDYDGPNTYEDAGNYIKVQFLELNMRRDVKEIYSHMTCATDTQNVKFVFDAVTDIII" + 
					"KENLKDCGLF";

	prot_BC049537 := "MIILYFLTLRLLKSFRSLLNFEKMGCTLSSDDKSAQERSKMIDRNLRDDGEKASREVKLLLLGAGESGKSTIVKQM" + 
					"KIIHEAGYSEEECKQYRVVVYSNTIQSIIAIIRAMGRLRIDFADPERADDARQLFVMAGSVDEGFMTS" + 
					"ELAGVIKRLWRDEGVRLCFHRSREYQLNDSASYYLNDLDRISNSSYVPTQQDVLRTRVKTTGIVETHF" + 
					"TFKDLHFKMFDVGGQRSERKKWIHCFEGVTSIIFCVSLSDYDLVLAEDEEMNRMHESMKLFDSICNNK" + 
					"WFTDTSIILFLNKKDLFEEKIRMSPLSICFPEYPGSDVYEEAAAYIQCQFEDLNKRKDTKEIYSHFTC" + 
					"ATDTKNVQFVFDAVTDVIIKNNLKDCGLF";
	prot_AF403384 := "MIVFLVFKHLFSLRLITMFFLLHFIVLINVKDFALTQGSMITPSCQKGYFPCGNLTKCLPRAFHCDGKDDCGNGADE" + 
					"QENCGDTSGWATIFGTVHGNANSVALTQECFLKQYPQCCDCKETELECVNGDLKSVPMISNNVTLLSLKKNKIH" + 
					"QSLPDKVFIKYTKLKKIFLQHNCIRHISRKAFFGLCNLQILYLNHNCITTLRPGIFKDLHQLTWLILDDNPITR" + 
					"QISQRLFTGLNSLFFLSMVNNYLEALPKQMCAQMPQLNWVDLEGNRIKYLTNSTFLSCDSLTVLFLPRNQIGFV" + 
					"QPEKTFSSLKNLGELDLSSNTITELSPHLFKDLKLLQKLNLSSNPLMYLHKNQFESLKQLQSLDLERIEIPNIN" + 
					"QTRMFQPMKNLSHIYFKNFRYCSYAPHVRICMPLTDGISSFEDLLANNILRIFVWVIAFITCFGNLFVIGMRSF" + 
					"QIKAENTTHAMSIKILCCADCLMGVYLFFVGIFDIKYRGQYQKYALLWMESVQCRLMGFLAMLSTEVSVLLLTYL" + 
					"QTLEKFLVIVFPFSNIRPGKRQTSVILICIWMAGFLIAVIPFWNKDYFGNFYGKNGVCFPLYYDQTEDIGSKGYSL" + 
					"QGIFLGVNLLAFLIIVFSYITMFCSIQKTALQTTEVRNCFGREVAVANRFFFIVFSDAICWIPVFVVKILSLFRVE" + 
					"QIPDTMTSWIVIFFLPVNSALNPILYTLTTNFFKDKLKQLLHKHQRKSIFKIKKKSLSTSIVWIEDSSSLKLGVLNKITLGDSIMKPVS";
					
	prot_BX647980 := "MTSGSVFFYILIFGKYFSHGGGQDVKCSLGYFPCGNITKCLPQLLHCNGVDDCGNQADEDNCGDNNGWSLQFDKYFASYY" + 
					"QKMTSQYPFEAETPECLVGSVPVQCLCQGLELDCDETNLRAVPSVSSNVTAMSLQWNLIRKLPPDCFKNYHDLQKLYL" + 
					"QQNNKITSISIYAFRGLNSLTKLYLSHNRITFLKPGVFEDLHRLEWLIIEDNHLSRISPPTFYGLNSLILLVLMNNVL" + 
					"QTRLPDKPLCQHMPRLHWLDLEGNHIHNLRNLTFISCSNLTVLVMRKNKINHLNENTFAPLQKLDELDLGSNKIENLP" + 
					"QPLIFKDLKELSQLNLSYNPIQKIQANQFDYLVKLKSLSLEGIEISNIQQRMFRPLMNLSHIYFKKFQYCGYAPHVRS" + 
					"QCKPNTDGISSLENLLASIIQRVFVWVVSAVTCFGNIFV";

	-- human/mouse potassium channel pair: the currently active "easy" test case
	hum_K := "AGRRRCPEERGGRARGGAGGGREPGPGGGGGGGARRGGGPRSGRSAALALALALAAAVEKMLQSLAGSSCVRLVER" + 
				"HRSAWCFGFLVLGYLLYLVFGAVVFSSVELPYEDLLRQELRKLKRRFLEEHECLSEQQLEQFLGRVLEASNYGVSV" + 
				"LSNASGNWNWDFTSALFFASTVLSTTGYGHTVPLSDGGKAFCIIYSVIGIPFTLLFLTAVVQRITVHVTRRPVLYF" + 
				"HIRWGFSKQVVAIVHAVLLGFVTVSCFFFIPAAVFSVLEDDWNFLESFYFCFISLSTIGLGDYVPGEGYNQKFREL" + 
				"YKIGITCYLLLGLIAMLVVLETFCELHELKKFRKMFYVKKDKDEDQVHIIEHDQLSFSSITDQAAGMKEDQKQNEP" + 
				"FVATQSSACVDGPANH"; 

	mouse_K := "PRVRPRVRREARSAGAGKMLQSLAGSSCVRLVERHRSAWCFGFLVLGYLLYLVFGAVVFSSVELPYEDLLRQELRK" + 
				"LKRRFLEEHECLSEPQLEQFLGRVLEASNYGVSVLSNASGNWNWDFTSALFFASTVLSTTGYGHTVPLSDGGKAFC" + 
				"IIYSVIGIPFTLLFLTAVVQRVTVHVTRRPVLYFHIRWGFSKQVVAIVHAVLLGFVTVSCFFFIPAAVFSVLEDDW" + 
				"NFLESFYFCFISLSTIGLGDYVPGEGYNQKFRELYKIGITCYLLLGLIAMLVVLETFCELHELKKFRKMFYVKKDK" + 
				"DEDLVHIMEHDQLSFSSVTEQVAGLKEEQKQSEPFVASQSPPYEDGSADH";
	
	-- earlier experiments, retained but disabled
--	print(align_by_mers(prot_AF403384,prot_BX647980,2));	-- harder alignment
--	print(align_by_mers(prot_BC038795,prot_AF055585,2));	-- harder alignment
--	print(align_by_mers(prot_NM_131868,prot_NM_006496,2));	-- easy alignment
--	print(align_by_mers(prot1,reverse_dna(prot2),2));		-- gives no signal
	print(align_by_mers(hum_K,mouse_K,2));					-- easy alignment
	print(align_by_mers(mouse_K,hum_K,2));					-- easy alignment, dual form
	
end protein_alignment_test;

procedure prepare_test_group(file_name);		-- prepare a multi-protein test group for alignment

	-- The file is assumed to hold nl header lines followed by nl data lines
	-- (2*nl lines total).  Each result element is "<c>.<data>", where <c> is
	-- the first character of header line j and <data> is line j of the second half.
	tags := [lines(j)(1): j in [1..nl := #(lines := get_lines(file_name))/2]]; 
	return [tag + "." + lines(j + nl): tag = tags(j)];
	
end prepare_test_group;

procedure pairwise_align_potassium_family(file_name);		-- pairwise alignment of a file of header/data line pairs

	-- Aligns every unordered pair of protein data lines (non-empty lines
	-- starting with "M", i.e. methionine) and prints either the match
	-- histogram plus the two aligned strings, or a no-signal message.
	lines := get_lines(file_name);
print("#lines: ",#lines);	
	for linj = lines(j), link = lines(k) | j > k and #linj > 0  and linj(1) = "M" and #link > 0  and link(1) = "M" loop

		-- j/2 and k/2 convert line numbers to record numbers (header+data pairs)
		print(j/2,":",k/2,if (abm := align_by_mers(linj,link,3)) = OM then " No alignment signal" else " match histogram: " + abm(3) + "\n" + join(abm(1..2),"\n") end if);
	end loop;

end pairwise_align_potassium_family; 

procedure examine_unigene_indices(saved_index_name);
				-- examine a unigene index: spot-check known first/last records of each species
						-- bumped at levels shown [54570, 6495, 222] while building LcXlHsMmDr.rix 15:52:59

		rix_object := open_rix(saved_index_name,find_genbank_id);		-- reopen the saved rix, supplying key finder
						
																	-- reread the data, one probe per species boundary
		print(actual_data(get_record(rix_object,"AV772144")));
		print(actual_data(get_record(rix_object,"CN825131"))); 		-- last record lotus
		print(actual_data(get_record(rix_object,"BC072736")));
		print(actual_data(get_record(rix_object,"CO554181"))); 		-- last record frog
		print(actual_data(get_record(rix_object,"NM_007570")));
		print(actual_data(get_record(rix_object,"BB616724"))); 		-- last record mouse
		print(actual_data(get_record(rix_object,"BX645668")));
		print(actual_data(get_record(rix_object,"CR603609"))); 		-- last record human
		print(actual_data(get_record(rix_object,"AY423021")));
		print(actual_data(get_record(rix_object,"CO931431"))); 		-- last record zebrafish

end examine_unigene_indices;

end test;

program test;        -- SETL interactive interface example 1: Tk GUI for interactive genomic utilities
	use tkw,string_utility_pak,genetic_code_pak,sort_pak;    -- use the main widget class
 	var txt,txt2;									-- the two text widgets (top display, bottom input)
 	var histo_wind := OM,histo_canv;							-- histogramming window
 	
	-- NOTE(review): duplicates protein_quality in genetic_code_pak -- keep the two in sync
	const protein_quality := {["A","h"],["G","h"],["V","h"],["L","h"],["I","h"],["P","h"],["F","h"],["M","h"],
			["W","h"],["C","h"],["N","n"],["Q","n"],["S","n"],["T","n"],["Y","n"],
			["R","p"],["K","p"],["H","p"],["D","a"],["E","a"]};
							  -- h = hydrocarbon, a = acid, b = basic, n = polar neutral

	var Tk;             -- globalize for use in procedure below
 
	Tk := tkw(); Tk(OM) := "interactive genomic utilities";
             -- create the Tk interpreter
 	topframe := Tk("frame","10,10"); topframe("side") := "top"; 
 	txt0 := topframe("text","102,1"); txt0("side") := "top"; txt0(OM) := 12 * "123456789."; 
	txt0("state,wrap,font") := "disabled,none,{Monaco 12 bold}";		-- read-only 120-column ruler line
	topleftframe := topframe("frame","10,10"); topleftframe("side") := "top"; 
	txt := topleftframe("text","100,15"); txt("side") := "left";
         -- create and place a text widget
             -- put some tagged text into it
	txt("wrap,font") := "none,{Monaco 12 bold}";

	protstg := "" +/ [j * ("AGVLIPPM??WCNQSTYRKHDE" + j) + "\n": j in [1..20]];		-- sample protein-like demo text
	
--	txt(OM) := tag_protein(protstg);
	txt(OM) := protstg;
 
	-- display attributes for the protein-quality tags attached by tag_protein
	txt("h","font,foreground") := "{Monaco 12 bold},black";		-- hydrocarbons are black
    	-- set the first tag's attributes
	txt("p","font,foreground") := "{Monaco 12 bold},#CC22FF";		-- bases  are modified magenta 
	txt("a","font,foreground") := "{Monaco 12 bold},white";		-- acids are white
	txt("n","font,foreground") := "{Monaco 12 bold},green";	-- polars are green
	txt("u","font,foreground") := "{Monaco 12 bold italic},#FFAAAA";	-- unknowns are italicised orange
	
 	sch := topframe("scrollbar","h,18"); sch("side,fill") := "top,x"; 
 	scv := topleftframe("scrollbar","v,18"); scv("side,fill") := "left,y"; 
	txt("xscroller") := sch;     -- attach horizontal scrollbar
	txt("yscroller") := scv;     -- attach vertical scrollbar
	txt("background,wrap,state") := "#888888,none,normal";

	-- one button per interactive operation; each is bound to a parameterless callback
	butframe := Tk("frame","10,10"); butframe("side") := "top";
	but := butframe("button","Protein Colors"); but("side") := "left";
	but{OM} := color_proteins;	-- or any other parameterless function
	but2 := butframe("button","DNA complement"); but2("side") := "left"; but2{OM} := dna_complement;
	but3 := butframe("button","Rev. comp."); but3("side") := "left"; but3{OM} := rev_complement;
	but4 := butframe("button","Trans. to prot."); but4("side") := "left"; but4{OM} := trans_6_ways;
	but4a := butframe("button","Best prot."); but4a("side") := "left"; but4a{OM} := find_best_trans;
	but5 := butframe("button","Cleanup"); but5("side") := "left"; but5{OM} := cleanup_bottom;
	but6 := butframe("button","Align 2"); but6("side") := "left"; but6{OM} := align2;
	but7 := butframe("button","Histogram"); but7("side") := "left"; but7{OM} := histogram1;
	but8 := butframe("button","Histo. 2"); but8("side") := "left"; but8{OM} := histogram2;
	but9 := butframe("button","To bot"); but9("side") := "left"; but9{OM} := to_bottom;
	but10 := butframe("button","Clear"); but10("side") := "left"; but10{OM} := clear_bottom;

	-- bottom (input) text widget with its own vertical scrollbar
	botframe := Tk("frame","10,10"); botframe("side") := "top"; 
 	txt2 := botframe("text","102,15"); txt2("side") := "left";
 	txt2("background,font") := "gold,{Monaco 12 bold}";
 	bscv := botframe("scrollbar","v,18"); bscv("side,fill") := "left,y"; 
	txt2("yscroller") := bscv;     -- attach vertical scrollbar

   Tk.mainloop();    -- enter the Tk main loop

	procedure color_proteins();		-- tag the text in text 2 and put it into text 1
		-- Builds a ruler line, strips whitespace/digits from the bottom text,
		-- upcases it, and shows it on top with protein-quality tags from tag_protein.
		scale_line := "\n<`a`>" +/ (10 * [9 * " " + j: j in "1234567890"]) + "<``a`>";
		botstring := txt2(OM); firstline := break(botstring,"\n\r");	-- NOTE(review): botstring/firstline are computed but never used here
		txt(OM) := tag_protein(case_change(suppress_chars(txt2(OM)," \t\n\r0123456789"),"lu")) +  scale_line; --print(result);
	end color_proteins;

	procedure dna_complement();		-- put the DNA complement of the text in text 2 into text 1
				-- characters with no complement entry are rendered as 'N'
		whole_text := txt2(OM);
		first_line := break(whole_text,"\n\r");		-- only the first line is complemented
		complemented := "";
		for pos in [1..#first_line] loop
			complemented +:= complement(first_line(pos))?"N";
		end loop;
		txt(OM) := complemented;
	
	end dna_complement;

	procedure rev_complement();		-- put the reverse complement of the first bottom line into text 1
		whole_text := txt2(OM);
		first_line := break(whole_text,"\n\r");		-- only the first line is processed
		revcomp := "";
		for pos in [1..#first_line] loop
			revcomp := (complement(first_line(pos))?"N") + revcomp;		-- prepend, so the result is reversed
		end loop;
		txt(OM) := revcomp;
	end rev_complement;

	procedure trans_6_ways();		-- check that the bottom is a DNA string; translate it to protein in all 6 ways

		botstg := suppress_chars(case_change(txt2(OM),"lu"),"0123456789> \t:,;\n\r");		-- drop extraneous characters and check
		frontstg := span(botstg,"ACTGN");		-- base letters only; anything left over means bad input
		
		if botstg /= "" then 
			txt(OM) := "ERROR: string contains characters not designating bases: " + botstg(1);
			return;
		end if;	
		
		revstg := "" +/ [frontstg(j): j in [#frontstg,#frontstg - 1..1]];	-- NOTE(review): reversed only, not complemented -- confirm intended

		txt(OM) := "+0>\n" + trans_dna(frontstg) + "\n+1>\n" + trans_dna(frontstg(2..)) + "\n+2>\n" + trans_dna(frontstg(3..))
				+ "\n-0>\n" + trans_dna(revstg) + "\n-1>\n" + trans_dna(revstg(2..)) + "\n-2>\n" + trans_dna(revstg(3..));
							-- translate for positive and negative strand, all 3 frames

	end trans_6_ways;

	procedure find_best_trans();		-- check that the bottom is a DNA string; show the best of its 6 frame translations

		botstg := suppress_chars(case_change(txt2(OM),"lu"),"0123456789> \t:,;\n\r");		-- drop extraneous characters and check
		frontstg := span(botstg,"ACTGN");		-- base letters only; anything left over means bad input
		
		if botstg /= "" then 
			txt(OM) := "ERROR: string contains characters not designating bases: " + botstg(1);
			return;
		end if;	
		
		tags := ["+0> ","+1> ","+2> ","-0> ","-1> ","-2> "];		-- frame labels: strand and offset
		revstg := "" +/ [frontstg(j): j in [#frontstg,#frontstg - 1..1]];	-- NOTE(review): reversed only, not complemented -- confirm intended
		
		-- [length,fragment] of the longest stop-free translation for each of the 6 frames
		trans_w_lens := [best_trans_dna(frontstg),best_trans_dna(frontstg(2..)),best_trans_dna(frontstg(3..))] +
								[best_trans_dna(revstg),best_trans_dna(revstg(2..)),best_trans_dna(revstg(3..))];
--print("trans_w_lens: ",trans_w_lens);
		max_max := max/[ln: [ln,stg] in trans_w_lens];		-- longest fragment length over all frames
		
		must := exists [ln,stg] = trans_w_lens(j) | ln = max_max; 	-- bind j, ln, stg to the winning frame
	
		txt(OM) := tags(j) + ln + ": " + stg; 	-- display best translation

	end find_best_trans;

	procedure trans_dna(stg);		-- format a translation, replacing \r stop markers by length-prefixed lines
		
		-- translate_dna yields a peptide string with "\r" at stop codons (see
		-- the genetic_code table); each fragment is shown as "<len>: <fragment>".
		-- Note the iteration variable deliberately shadows the parameter stg.
		return join([str(#stg) + ": " + stg: stg in breakup(translate_dna(stg),"\r")],"\n");
		
	end trans_dna;

	procedure best_trans_dna(stg);		-- return [length,fragment] for the longest stop-free translation fragment
		
		-- breakup at "\r" splits the translation at stop codons; maxl is the
		-- longest fragment length.  The iteration variable shadows parameter stg.
		maxl := max/ [#stg: stg in (bus := breakup(translate_dna(stg),"\r"))];
		
		must := exists stg in bus | #stg = maxl; 	-- rebind stg to some longest fragment

		return [maxl,stg];
		
	end best_trans_dna;

	procedure cleanup_bottom();		-- remove extraneous characters and capitalize entire bottom; then put into top
		-- NOTE(review): reads from txt2 but writes the result to txt, not back to txt2
		txt(OM) := cleanup(txt2(OM));
	end cleanup_bottom;

	procedure cleanup(stg);		-- remove digits, punctuation, and whitespace-like noise, then capitalize
		return suppress_chars(case_change(stg,"lu"),"0123456789> \t:,;+-");
	end cleanup;

	procedure align2();		-- align two bottom strings, which are separated by an empty line
		
		-- Splits the bottom text into trimmed lines.  If there are exactly two
		-- lines they are aligned directly; otherwise the lines before the first
		-- empty line form string 1 and the lines after it (up to the next empty
		-- line, if any) form string 2.  Mer length 7 is used for pure base
		-- strings, 2 for peptide strings.
		lines := [trim(line): line in breakup(txt2(OM),"\n\r")];

		if #lines = 2 then			-- exactly two lines: align them directly
			[astg1,astg2,-] := align_by_mers(lines(1),lines(2),2);
			txt(OM) := astg1 + "\n" + astg2;
			return;		-- BUG FIX: formerly fell through, so the error branch below clobbered this result
		end if;
		
		if (not (exists line = lines(j) | line = "")) or j = #lines then	-- need a separator line with text after it
			txt(OM) := "*******Error: two input lines are not available";
			return;
		end if;

		-- second string ends just before the next empty line that follows text, else at the last line
		top := if exists line = lines(k) | k > j and line = "" and lines(k - 1) /= "" then k else #lines end if;
		stg1 := ostg1 := cleanup("" +/ lines(1..j - 1)); stg2 := ostg2 := cleanup("" +/ lines(j + 1..top));
		front1 := span(ostg1,"ACTGN"); front2 := span(ostg2,"ACTGN");		-- distinguish the 'bases' from the 'peptides' case
		merlen := if stg1 = front1 and stg2 = front2 then 7 else 2 end if;

		[astg1,astg2,-] := align_by_mers(stg1,stg2,merlen);
		astg1 := case_change(astg1,"lu"); nas2 := #(astg2 := case_change(astg2,"lu"));
				
				-- write a middle string whose non-blank characters show the differences between the two aligned strings 
		mid_stg := "" +/ [if c = astg2(j) then " " else c end if: c = astg1(j) | j <= nas2];
	
		txt(OM) := astg1 + "\n" + mid_stg + "\n" + astg2;

	end align2;

	procedure trim(line);		-- trim whitespace at front of a line
		span(line," \t"); return line;		-- span destructively removes leading blanks and tabs from its argument
	end trim;

	procedure histogram(tup);		-- set up captions and call histogramming routine
		-- tup is a tuple of histograms; each histogram is a tuple of [character_or_pair,frequency] items

		sample_domain_item := tup(1)(1)(1);		-- sample domain value, to tell single characters from pairs
		chars := test_chars := "" +/ [c: [c,n] in tup(1)]; span(test_chars,"ATCGN");		-- anything left after spanning off base letters marks a peptide alphabet

		caption := if test_chars /= "" then 
						if #sample_domain_item = 1 then "Peptide Histogram" else "Peptide Pair Histogram" end if
					elseif #sample_domain_item = 1 then "Base Histogram" else "Base Pair Histogram" end if;

		display_histogram(tup,"Histogram",caption,[c: [c,n] in tup(1)]);

	end histogram;

	procedure display_histogram(histo_data_list,window_caption,top_label,bot_label_tup);		-- draw one or more histograms on a shared Tk canvas
		-- histo_data_list: tuple of histograms (each a tuple of [x,y] pairs), or a single histogram;
		-- window_caption titles the toplevel window; top_label is drawn at the canvas top;
		-- bot_label_tup supplies one label per bucket, drawn under the bar groups
		
		window_height := 400; window_width := 640;
		nc := #(colors := ["red","green","blue","yellow","cyan","magenta","orange","grey","brown"]);
		
		if not is_tuple(histo_data_list(1)) then histo_data_list := [histo_data_list]; end if;		-- allow a single data tuple as argument
		
		topper := max/ [y: histo_data in histo_data_list, [x,y] in histo_data]; 		-- largest y anywhere, used for scaling
		normalized_histos := [[[x,y/topper]: [x,y] in histo_data]: histo_data in histo_data_list];	
		
		if histo_wind = OM then 			-- create a new canvas

			histo_wind := Tk("toplevel",str(window_width) + "," + window_height); histo_wind(OM) := window_caption;
			histo_wind{"Destroy"} := lambda(); histo_wind := OM; end lambda;
			histo_canv := histo_wind("canvas",str(window_width) + "," + window_height); histo_canv("side") := "top";
			hline := histo_canv("line",Tk.tostr([0,window_height - 20,600,window_height - 20]));		-- baseline under the bars

		else		-- clear the canvas, and re-open the window containing it if necessary
			
			for obj in histo_canv(OM) loop obj.delete(); end loop;

		end if;
 
		label := histo_canv("text",top_label); 
		label("coords") := str(window_width/2) + ",10"; label("font,anchor") := "{Times 14 bold},n";
		nnh := #normalized_histos;
		
		bar_width := ((window_width - 14)/ (nnh * (nnhi1 := #normalized_histos(1))) - 3) max 1;
			-- allow two pixels between adjacent bars and 5 pixels between adjacent groups
		interbar_space := (window_width - 14) - bar_width * nnh * nnhi1;
		group_width := bar_width * nnh + interbar_space/nnhi1;

		for normalized_histo = normalized_histos(gno), pair = normalized_histo(k) loop		-- one bar per data point

			hloc := group_width * (k - 1) + bar_width * (gno - 1) + 7;
			rect := histo_canv("rectangle",Tk.tostr([hloc,window_height - 20,hloc + bar_width - 2,window_height - 20 - float(window_height - 60) * pair(2)])); 
			rect("fill") := colors((gno - 1) mod nc + 1); 
		
		end loop;

				-- BUG FIX: the bucket labels were formerly drawn inside the per-bar loop above,
				-- redrawing the full label row once for every bar; draw each label exactly once instead
		for j in [1..nnhi1] loop 
			bpl := histo_canv("text",bot_label_tup(j)); 
			bpl("coords") := Tk.tostr([(j * group_width) - group_width/2 + 4,window_height - 15]); 
			bpl("font,anchor") := "{Times 14 bold},n";
		end loop;

	end display_histogram;

	procedure histogram1();		-- histogram the bottom lines, by characters
		-- each non-empty line of the bottom pane yields one histogram of single-character frequencies

		lines := [trim(case_change(line,"lu")): line in breakup(txt2(OM),"\n\r") | line /= ""];
		chars := {c: line in lines, c in line};		-- character domain, pooled over all lines
		
		histo_tup := []; 
		
		for line in lines loop
			histo := {[c,0]: c in chars}; count := 0;
			for c in line loop histo(c) +:= 1; count +:= 1; end loop;
			histo_tup with:= [[c,float(n)/count]: [c,n] in merge_sort(histo)];		-- normalize counts to frequencies, sorted by character
		end loop;
		
		histogram(histo_tup);
		
	end histogram1;

	procedure histogram2();		-- histogram the bottom lines, by character pairs
		-- like histogram1, but the domain is all adjacent character pairs seen in any line

		lines := [trim(case_change(line,"lu")): line in breakup(txt2(OM),"\n\r") | line /= ""];
		chars := {line(j..j + 1): line in lines, j in [1..#line - 1]};		-- all adjacent pairs, pooled over all lines
		
		histo_tup := []; 
		
		for line in lines loop
			histo := {[c,0]: c in chars}; count := 0;
			for j in [1..#line - 1] loop histo(line(j..j + 1)) +:= 1; count +:= 1; end loop;
			histo_tup with:= [[c,float(n)/count]: [c,n] in merge_sort(histo)];		-- normalize counts to frequencies, sorted by pair
		end loop;
		
		histogram(histo_tup);

	end histogram2;

	procedure to_bottom();		-- move top text to the bottom pane, echoing it to the console
		txt2(OM) := res := txt(OM); print("\n",res);
	end to_bottom;

	procedure clear_bottom();		-- clear the bottom text pane
		txt2(OM) := "";
	end clear_bottom;

 	procedure tag_protein(stg);		-- add color tags to a protein string
		-- each peptide letter c is bracketed by <`q`>...<``q`> markers, where q is its
		-- protein_quality class (h/n/p/a), defaulting to "u" for letters not in the map
		return "" +/ [tagged(c): c in stg];
		
		procedure tagged(c);			-- tag a character with its quality-class markers
				return "<`" + protein_quality(c)?"u" + "`>" + c + "<``" + protein_quality(c)?"u" + "`>";

		end tagged;
		
	end tag_protein;
	
 end test;
 
 program test;				-- temporary collection of human_to_mouse index setup procedures
	use string_utility_pak,sort_pak,get_lines_pak;
--	var master_folder_prefix := "Diana:diana2003:Pub:FromGiuseppeLatest:SETLFolder:";
	var master_folder_prefix := "Diana_backup:";
			-- path to folder containing all the files and folders for the process represented by this code

			-- This is expected to contain the following files and folders, all but the input files being built by this code
			-- HumanGenome, MouseGenome
			-- mer_files_folder		-- the generated files of address_tagged mers, before and after sorting
				-- contains files humangenome.a,humangenome.c,... and mousegenome.a,mousegenome.c,...
			-- HumanMouseMerge		the common uniquely appearing mers from the human and mouse mer files, with attached offsets
			-- OK_HumanMouseMerge	file of offsets which appear frequently enough to be significant without additional verification
			-- HumanMouseMerge_to_verify	file of offsets whose appearance requires verification for significance 
											-- added to OK_HumanMouseMerge after verification
			-- HumanMouseMerge_Madresses	the elements in HumanMouseMerge_to_verify, expanded to include peptides and the human address suffix,
											--  but with offsets replaced by mouse addresses
			-- HumanMouseMerge_verif_secs	-- the elements in HumanMouseMerge_Madresses, with 50 peptides in each direction, centered on the
											-- 9-mer in the  element, added as a suffix

	var mer_files_prefix := "mer_files_folder:";			-- folder containing set of mer files, before and after sorting by mers
--	const human_chromosomes_prefix := "Diana:untitled folder:HumanGenome:";
--	const mouse_chromosomes_prefix := "Diana:untitled folder:MouseGenome:";
	const human_chromosomes_prefix := "Diana_backup:HumanGenome:";
	const mouse_chromosomes_prefix := "Diana_backup:MouseGenome:";

	var unique_elements_found,duplicates_removed := 0;			-- global counts for removal of duplicates
	var common_mers_found := 0;									-- global count, used in attach_mer_offsets_and_assemble
	const peptide_codes := "ACDEFGHIKLMNPQRSTVWY";			-- one-letter codes for the peptides

	var human_chromosomes := ["chr1.fa", "chr10.fa", "chr11.fa", "chr12.fa", "chr13.fa", "chr14.fa", 
								"chr15.fa", "chr16.fa", "chr17.fa", "chr18.fa", "chr19.fa", "chr2.fa", "chr20.fa",
								 "chr21.fa", "chr22.fa", "chr3.fa", "chr4.fa", "chr5.fa", "chr6.fa", "chr7.fa",
								 "chr8.fa", "chr9.fa", "chrX.fa", "chrY.fa"];


	var mouse_chromosomes := ["chr1.fa", "chr10.fa", "chr11.fa", "chr12.fa", "chr13.fa", "chr14.fa", 
								"chr15.fa", "chr16.fa", "chr17.fa", "chr18.fa", "chr19.fa", "chr2.fa", 
								"chr3.fa", "chr4.fa", "chr5.fa", "chr6.fa", "chr7.fa", "chr8.fa", 
								"chr9.fa", "chrM.fa", "chrX.fa", "chrY.fa"];

	const basic_mer_length := 9, repeated_offset_threshold := 12;		-- mers are 9 peptides; offsets seen at least this often are accepted outright
	const chromo_offset := 240000000;		-- nominal offset of genome addresses between successive chromosomes
	const genome_top := 5000000000;			-- nominal maximum length of genome; also the reverse-strand address displacement
	const twice_genome_max := 2 * genome_top;		-- added to H-to-M offsets to force them positive before encoding
	
	const peptide := 	-- maps each codon to its one-letter peptide code; extends the genetic code to allow
						-- 'N' characters, designating unknown bases. The extension is done in a somewhat
						-- arbitrary way, but avoids introducing stops
		{["TTT","F"], ["TTC","F"], ["TTA","L"], ["TTG","L"], ["TCT","S"],
	 	["TCC","S"], ["TCA","S"], ["TCG","S"], ["TAT","Y"], ["TAC","Y"], ["TAA","*"],
	 	 ["TAG","*"], ["TGT","C"], ["TGC","C"], ["TGA","*"], ["TGG","W"], ["CTT","L"],
	 	["CTC","L"], ["CTA","L"], ["CTG","L"], ["CCT","P"], ["CCC","P"], ["CCA","P"],  
	 	["CCG","P"], ["CAT","H"], ["CAC","H"], ["CAA","Q"], ["CAG","Q"], ["CGT","R"], 
	 	["CGC","R"], ["CGA","R"], ["CGG","R"], ["ATT","I"], ["ATC","I"], ["ATA","I"], 
	 	["ATG","M"], ["ACT","T"], ["ACC","T"], ["ACA","T"], ["ACG","T"], ["AAT","N"], 
	 	["AAC","N"], ["AAA","K"], ["AAG","K"], ["AGT","S"], ["AGC","S"], ["AGA","R"], 
	 	["AGG","R"], ["GTT","V"], ["GTC","V"], ["GTA","V"], ["GTG","V"], ["GCT","A"], 
	 	["GCC","A"], ["GCA","A"], ["GCG","A"], ["GAT","D"], ["GAC","D"], ["GAA","E"], 
	 	["GAG","E"], ["GGT","G"], ["GGC","G"], ["GGA","G"], ["GGG","G"],
	 	["TNT","F"], ["TNC","F"], ["TNA","L"], ["TNG","L"],			-- 'N' in the middle position 
	 	["CNT","L"], ["CNC","L"], ["CNA","L"], ["CNG","L"],
	 	["ANT","I"], ["ANC","I"], ["ANA","I"], ["ANG","M"],
	 	["GNT","A"], ["GNC","A"], ["GNA","A"], ["GNG","A"],
	 	["NTT","F"], ["NTC","F"], ["NTA","L"], ["NTG","L"], 		-- 'N' in the left position 
	 	["NCT","S"], ["NCC","S"], ["NCA","S"], ["NCG","S"], 
	 	["NAT","Y"], ["NAC","Y"], ["NAA","S"], ["NAG","S"], 
	 	["NGT","C"], ["NGC","C"], ["NGA","S"], ["NGG","W"],
	 	["TTN","F"], ["TCN","F"], ["TAN","L"], ["TGN","L"], 		-- 'N' in the right position 
	 	["CTN","S"], ["CCN","S"], ["CAN","S"], ["CGN","S"], 
	 	["GTN","C"], ["GCN","C"], ["GAN","S"], ["GGN","W"],
	 	["ATN","C"], ["ACN","C"], ["AAN","S"], ["AGN","W"],
	 	["ANN","C"], ["GNN","C"], ["CNN","S"], ["TNN","W"],		-- two N's
	 	["NAN","C"], ["NGN","C"], ["NCN","S"], ["NTN","W"],
	 	["NNA","C"], ["NNG","C"], ["NNC","S"], ["NNT","W"],
		["NNN","W"]};

	const peptide_number := {["A", 1], ["V", 18], 		-- mapping of 1-letter peptide code into its number (alphabetic rank)
							["W", 19], ["C", 2], ["E", 4], ["G", 6], ["I", 8], 
							["T", 17], ["Y", 20], ["P", 13], ["R", 15], ["K", 9], 
							["N", 12], ["L", 10], ["M", 11], ["F", 5], ["S", 16], 
							["D", 3], ["H", 7], ["Q", 14]};
	
	const divisors := [10000000000, 100000000, 1000000, 10000, 100, 1];		-- powers of 100 used to encode 12-digit addresses as 6 bytes
	
	var comp_peptide :=   		-- complement of the 'peptide' mapping: maps each codon to the peptide coded by
								-- its reverse complement (used when scanning the reverse strand)
			{["NTC", "S"], ["TGT", "T"], ["AGC", "S"], ["CNA", "A"], ["NGC", "S"], ["NNC", "C"], 
			["CCG", "G"], ["CGA", "A"], ["GGT", "P"], ["ACC", "W"], ["AGG", "S"], ["NCC", "W"],  
			["NGG", "S"], ["TCG", "S"], ["ACG", "C"], ["CCC", "G"], ["NCG", "C"], ["NNG", "S"], 
			["TTC", "K"], ["AAN", "F"], ["ATA", "Y"], ["NTA", "Y"], ["TAA", "I"], ["AAT", "L"], 
			["GAT", "L"], ["NAT", "L"], ["TNA", "I"], ["AGN", "F"], ["ANT", "L"], ["GNT", "L"], 
			["CAG", "V"], ["AAG", "F"], ["CTT", "E"], ["NAG", "F"], ["ANG", "F"], ["GCC", "R"], 
			["GGA", "P"], ["TCC", "R"], ["CAC", "V"], ["CGT", "A"], ["CTA", "D"], ["GTT", "Q"], 
			["GCG", "R"], ["TGA", "T"], ["CNT", "A"], ["GTA", "H"], ["NAN", "W"], ["TTG", "N"], 
			["ATC", "*"], ["CCN", "W"], ["TCN", "W"], ["ANN", "W"], ["NNN", "W"], ["GAN", "S"], 
			["NTN", "C"], ["CAN", "C"], ["CTN", "S"], ["GAA", "L"], ["GTN", "S"], ["TAN", "C"], 
			["TTN", "S"], ["CNN", "C"], ["GCN", "S"], ["GGN", "S"], ["GNA", "L"], ["NGN", "S"], 
			["TAT", "I"], ["TNN", "C"], ["CGN", "C"], ["GNN", "S"], ["NCN", "C"], ["TGN", "C"], 
			["TNT", "I"], ["ATN", "L"], ["GTC", "Q"], ["AGA", "S"], ["CNC", "A"], ["NGA", "S"], 
			["TCA", "S"], ["ACA", "C"], ["ACN", "L"], ["CAT", "V"], ["CGC", "A"], ["NCA", "C"], 
			["CTC", "E"], ["GCT", "R"], ["TCT", "R"], ["CNG", "A"], ["CCA", "G"], ["CGG", "A"], 
			["NNA", "W"], ["ATG", "Y"], ["NTG", "Y"], ["TAG", "I"], ["TAC", "M"], ["ACT", "*"], 
			["TNG", "I"], ["TNC", "M"], ["ATT", "*"], ["CCT", "G"], ["CTG", "D"], ["GCA", "R"], 
			["GGC", "P"], ["TGG", "T"], ["CAA", "V"], ["NTT", "S"], ["AAA", "F"], ["AGT", "S"], 
			["NAA", "F"], ["NGT", "S"], ["NCT", "S"], ["ANA", "F"], ["GGG", "P"], ["TGC", "T"], 
			["NNT", "C"], ["GAG", "L"], ["GNG", "L"], ["GTG", "H"], ["TTA", "N"], ["AAC", "L"], 
			["GAC", "L"], ["NAC", "L"], ["TTT", "K"], ["ANC", "L"], ["GNC", "L"]};

human_chromosomes := ["Human_mini_chromosome"]; mouse_chromosomes := ["mouse_mini_chromosome"];			-- reset to small files for testing 	
human_chromosomes := ["chr1.fa"]; mouse_chromosomes := ["mouse_mini_chromosome"];			-- reset again for testing; this second assignment supersedes the first 	
test_numan_mouse_comparison();		-- top level control program for human/mouse comparison (note the 'numan' spelling matches the procedure name)

				-- ******** the sequence of processing steps is ******** 

procedure test_numan_mouse_comparison();		-- top level control program for human/mouse comparison
		-- builds the per-letter mer file name lists, then runs the pipeline steps in order

	hpref := human_chromosomes_prefix; rmatch(hpref,":"); short_name := rbreak(hpref,":");   	-- e.g. "HumanGenome"
	human_mer_list := [master_folder_prefix + mer_files_prefix + short_name + "." + c: c in peptide_codes];
	mpref := mouse_chromosomes_prefix; rmatch(mpref,":"); short_name := rbreak(mpref,":");   	-- e.g. "MouseGenome"
	mouse_mer_list := [master_folder_prefix + mer_files_prefix + short_name + "." + c: c in peptide_codes];

print("Start: ",time());
	genome_to_mers(human_chromo_list := [human_chromosomes_prefix + cname: cname in human_chromosomes]);			-- generate the human 9-mers files
print("Done making human mers ",time()); print("stopping:"); stop;		-- NOTE(review): debugging halt — every step below is currently unreachable
	sort_by_mers(human_mer_list);										-- sort the human 9-mers files alphabetically by mers
print("Done sorting human mers ",time());
	prune_duplicate_mers(human_mer_list);		-- prune the duplicated mers from the human 9-mers file
	genome_to_mers(mouse_chromo_list := [mouse_chromosomes_prefix + cname: cname in mouse_chromosomes]);			-- generate the mouse 9-mers files
print("Done making mouse mers ",time());
	sort_by_mers(mouse_mer_list);										-- sort the mouse 9-mers files alphabetically by mers
print("Done sorting mouse mers ",time());
	prune_duplicate_mers(mouse_mer_list);		-- prune the duplicated mers from the mouse 9-mers file
print("Calculating offsets ",time());
	attach_mer_offsets_and_assemble(human_mer_list,mouse_mer_list);		-- find common mers occurring just once and attach offsets; create HumanMouseMerge
	sort_by_offsets("HumanMouseMerge");									-- sort the HumanMouseMerge file alphabetically by offsets
print("Done sorting by offsets ",time());
	segregate_frequent_offsets("HumanMouseMerge");						-- write the common offsets to one file and those needing verification to another
print("Done segregating frequent offsets ",time());
stop;		-- NOTE(review): second debugging halt — the verification phase below is also disabled
																				-- this creates OK_HumanMouseMerge and HumanMouseMerge_to_verify
	offsets_to_M_addresses("HumanMouseMerge_to_verify","HumanMouseMerge");	-- convert the offsets needing verification to mouse addresses		
																				-- this creates HumanMouseMerge_Madresses, items are AAAAAAPPPPPPPPPnnnnnn
	build_H_sections("HumanMouseMerge_Madresses");							-- append data blocks to the HumanMouseMerge_Madresses items
																				-- this also uses the human chromosome data files
																				-- it produces a file called HumanMouseMerge_verif_secs 
	sort_by_addresses("HumanMouseMerge_verif_secs");							-- sort the HumanMouseMerge_verif_secs file alphabetically by mouse addresses

end test_numan_mouse_comparison;

procedure genome_to_mers(file_name_list);		-- translates the mers of each chromosome into proteins
												-- in all 3 ways, and breaks these into position-labeled
												-- 9-mers. The position used is that in the underlying chromosome. 
												-- positions are encoded as 6-character strings of the form
												-- Cnnnnn, where the first 24 small letters a,b,.. indicate
												-- positions on the human chromosome, and 12-digit integers are 
												-- encoded in 6 bytes, with 0 represented by "!" = char(33) 
												-- positions on the reverse strand of a chromosome are offset
												-- 5,000,000,000 to give them unique nominal addresses
												 
			-- this produces 20 output files with names like HumanGenome.a, each containing all labeled mers starting with the
			-- peptide code letter that the file name indicates. The lines in these files have the form PPPPPPPPPnnnnnn,
			-- where addresses in successive chromosomes are offset by 240,000,000 to make them disjoint even for the largest chromosome
			 
	first_file_name := file_name_list(1); rbreak(first_file_name,":"); rmatch(first_file_name,":");		-- strip the file part of the first path
	short_file_name := rbreak(first_file_name,":");			-- this will be "HumanGenome" or "MouseGenome"
--	short_file_name := "HumanGenome";

	ohandles := {[c,open(master_folder_prefix + mer_files_prefix + short_file_name + "." + c,"TEXT-OUT")]: c in peptide_codes};
				-- open the 20 output files to be used as destinations, each being accessed by a possible first mer letter
print("file_name_list: ",file_name_list);
	for chromo_file = file_name_list(chrom_number) loop				-- distribute each of the chromosome files into the output file
		ihandle := open(chromo_file,"RANDOM");			-- random access, so distribute_mers can read it in buffered blocks
--print("ihandle: ",ihandle);
		distribute_mers(ihandle,ohandles,chrom_number);
		close(ihandle);							-- release the chromosome file
	end loop;

	for [c,handl] in ohandles loop close(handl); end loop;			-- release all the output files
	
end genome_to_mers;

procedure distribute_mers(chrom_handle,ohandles,chrom_number);
			-- build the mers of a chromosome file and distribute them according to their first letters 
			-- for each 27-base window we emit (a) the forward-strand 9-peptide mer and (b) the
			-- reverse-strand mer built via comp_peptide; mers containing a stop ('*') are dropped
	
	chrom_size := fsize(chrom_handle);			-- get the chromosome size
		-- we buffer the chromosome in by blocks of 1,000 bytes, allowing 12-byte overlaps so that no 9-mers are omitted

	print("Starting to distribute  peptide mers constructed for chromosome: ",chrom_number," ",time()); debug_count := 0;
	
	for j in [1,975..chrom_size - 26] loop		-- advance by 974; see the alignment comment inside the inner loop

		gets(chrom_handle,j,1000 min (chrom_size - j + 1),chrom_piece);			-- get 1,000 byte section of the chromosome
		chrom_piece := suppress_chars(case_change(chrom_piece,"lu"),"HR>0123456789 \t\n\r");		-- capitalize and drop FASTA header/layout characters
--if j = 1 then print("j: ",j," ",chrom_piece); end if;	
		for k in [0..#chrom_piece - 27] loop	-- iterate over the section, extracting 27-mers for conversion to protein 9-mers
						-- since (in all but the final j iteration) the last 27-mer handled in this loop is 
						-- j + 973, the outer j iteration must advance by steps of 974 so that the 
						-- first inner step of the next outer iteration handles position 
						-- j + 974, thereby ensuring that no 27-mer is processed twice and that none is omitted.

		if ((debug_count +:= 1) mod 1000000) = 0 then print(debug_count," ",time()); end if; 	-- progress trace

			address := j + k;			-- the character address, on the current chromosome, of the mer being formed

			address_code := "" +/[char(((address/divisor) mod 100) + 33): divisor in divisors];
							-- encode the 12-digit chromosome address as a 6-byte string 
			
							-- encode the 27-base sequence of bases as a peptide string of length 9
			peptide_code := "" +/ [peptide(chrom_piece(kpo := k + offs..kpo + 2)): offs in [1,4..25]];
			
			if "*" notin peptide_code then 		-- drop mers containing stops
				peptide_code +:= address_code; 
--print("peptide_code: ",peptide_code," ",ohandles," ",peptide_number(peptide_code(1)));		
				printa(ohandles(peptide_code(1)),peptide_code);		-- route by first peptide letter
			end if;
			
					-- now we need to generate the corresponding complementary_strand peptide mer.
					-- this will have the nominal chromosome address chrom_size - address (so that 
					-- these addresses increase along the peptide chain) and its peptides will be
					-- generated in the reverse order using the 'comp_peptide' map.
--if exists offs in [25,22..1] | comp_peptide(chrom_piece(kpo := k + offs..kpo + 2)) = OM then print(k," ",k + offs," ",#chrom_piece," ",chrom_piece(kpo := k + offs..kpo + 2)); end if;
			peptide_code := "" +/ [comp_peptide(chrom_piece(kpo := k + offs..kpo + 2)): offs in [25,22..1]];
			address := chrom_size - address;			-- see comment above

			address_code := "" +/[char((((address + genome_top)/divisor) mod 100) + 33): divisor in divisors];
						-- encode the 12-digit chromosome address as a 6-byte string, offsetting by genome_top
						-- to indicate that this is on the reverse strand
							 
			
			if "*" notin peptide_code then 		-- as above: drop mers containing stops
				peptide_code +:= address_code;		
				printa(ohandles(peptide_code(1)),peptide_code);
			end if;

		end loop;
		
	end loop;

	print("Done distributing peptide mers constructed for chromosome: ",chrom_number," ",time());
	
end distribute_mers;

procedure prune_duplicate_mers(file_name_list);		-- prune the duplicated mers from the indicated list of files
	-- rewrites each ___Genome.a, ___Genome.b,... file in place with all duplicated mers removed;
	-- the global tallies are reset here and accumulated by prune_duplicates_from_one_file
	
	unique_elements_found := duplicates_removed := 0;		-- reset the global counters

	for fname = file_name_list(fno) loop
		prune_duplicates_from_one_file(fname);
	end loop;

	print("Done removing duplicates: "," ",time()," "," num. unique elements found is: ",unique_elements_found,
					" num. duplicates removed is: ",duplicates_removed);
	
end prune_duplicate_mers;

procedure prune_duplicates_from_one_file(file_name);		-- prune the duplicated mers from one file
		-- copies lines with distinct leading mers to a temporary 'junk' file, then copies that back
		-- over the source. The input must already be sorted by mer so that duplicates are adjacent.
		-- accumulates the global counters unique_elements_found and duplicates_removed
	
--	print("Starting to remove duplicates from file: ",file_name," ",time()); 
	debug_count := 0;
	
	junkhandle := open("junk","TEXT-OUT");				-- open and clear the auxiliary file
	sourcehandle := open(file_name,"TEXT-IN");
	
	geta(sourcehandle,x); prev_x := OM;		-- NOTE(review): prev_x is unused; the comparison below uses prev_pepseq,
											-- which starts as OM so the first line is always kept
	
	while x /= OM loop
		
		if ((debug_count +:= 1) mod 10000000) = 0 then print(debug_count," ",time()); end if; 	-- progress trace
		if prev_pepseq /= (this_pepseq := x(1..basic_mer_length)) then 		-- mer differs from the previous line: keep it
			printa(junkhandle,x); unique_elements_found +:= 1; 
		else
			duplicates_removed +:= 1;			-- note the removal of a duplicate
		end if;
		prev_pepseq := this_pepseq;
		geta(sourcehandle,x); 

	end loop;
	
	close(junkhandle); close(sourcehandle); 			-- now copy the temporary file into the source file
	junkhandle := open("junk","TEXT-IN");				-- reopen the auxiliary file, this time for reading
	sourcehandle := open(file_name,"TEXT-OUT");			-- reopening for output clears the source file
	
	geta(junkhandle,x); prev_x := OM;
	
	while x /= OM loop
		printa(sourcehandle,x); 
		geta(junkhandle,x); 
	end loop;

	close(junkhandle); close(sourcehandle); 			-- release both files
	junkhandle := open("junk","TEXT-OUT"); close(junkhandle); 	-- open and clear the auxiliary file
		
	
end prune_duplicates_from_one_file;

procedure attach_mer_offsets_and_assemble(file_name_listH,file_name_listM);		
		-- this takes the two sets of HumanGenome.a, HumanGenome.b,... and MouseGenome.a, MouseGenome.b,... files,
		-- and works them over together in a merge-like pass, prefixing each human mer for which there is an identical 
		-- mouse mer by the offset to that Mouse mer. Elements for which there is no corresponding Mouse mer are dropped. 
		-- The file produced is called HumanMouseMerge; its elements have the form OOOOOOPPPPPPPPPnnnnnn:
		-- a 6-byte encoded offset, the 9-peptide mer, and the 6-byte encoded human address.
	
			-- BUG FIX: the start message formerly referenced 'file_name', which is undefined (OM) in this scope,
			-- and described the wrong operation
	print("Starting to attach mer offsets and assemble merged file ",time()); debug_count := 0;
	common_mers_found := 0;				-- this will be totaled over all the files merged

	ohandle := open(master_folder_prefix + "HumanMouseMerge","TEXT-OUT");
	
	for file_nameH = file_name_listH(j) loop attach_mer_offsets_one_file(file_nameH,file_name_listM(j),ohandle); end loop;
	
	close(ohandle); 	-- release the output file
	print("Done finding common mers; total found is ",common_mers_found," ",time()); 
	
end attach_mer_offsets_and_assemble;

procedure attach_mer_offsets_one_file(file_name_H,file_name_M,ohandle);		-- finds common elements in one pair of mer files
		-- these are written to the output file with a 6-character offset attached.
		-- both inputs are sorted by mer and duplicate-free, so one merge pass suffices

	inhandleH := open(file_name_H,"TEXT-IN"); inhandleM := open(file_name_M,"TEXT-IN"); 
	print("Starting to find common elements in files: ",file_name_H," and ",file_name_M," ",time()); debug_count := 0;
	bmlp1 := basic_mer_length + 1;		-- first character position of the encoded address field
	
	geta(inhandleH,current_H_item); geta(inhandleM,current_M_item);			-- read the two initial items
	
	while current_H_item /= OM and current_M_item /= OM loop		-- while there are items to be compared
		
		if (chif := current_H_item(1..basic_mer_length)) = (cmif := current_M_item(1..basic_mer_length)) then 
							-- we have a common mer; calculate its H to M offset and write it to the output file
						-- decode the H position
			
			H_item_addstg := current_H_item(bmlp1..); 			-- get the two mer addresses, in coded form 
			M_item_addstg := current_M_item(bmlp1..);
			H_item_addr := M_item_addr := 0;
																-- decode the addresses (base-100 digits, biased by char(33))
			for c in H_item_addstg loop H_item_addr := (H_item_addr * 100) + abs(c) - 33; end loop;
			for c in M_item_addstg loop M_item_addr := (M_item_addr * 100) + abs(c) - 33; end loop;
--print("common element and addresses: ",chif," ",H_item_addr," ",M_item_addr," ",M_item_addr - H_item_addr);						
			
						-- 
			offset := M_item_addr - H_item_addr + twice_genome_max;		-- get the offset, displaced by twice_genome_max to force positive
						-- re-encode the offset as a 6-byte string
			encoded_offset := "" +/[char(((offset/divisor) mod 100) + 33): divisor in divisors]; 
			
			offset_tagged_mer := encoded_offset + current_H_item;				-- attach the offset to the current_H_item
		
			printa(ohandle,offset_tagged_mer);	-- write it to the file of common elements			
			common_mers_found +:= 1;
			geta(inhandleH,current_H_item); 			-- advance the H item, since one must be advanced

		elseif chif < cmif then 		-- advance the H_item

			geta(inhandleH,current_H_item); 

		else 							-- advance the M_item

			geta(inhandleM,current_M_item);	

		end if;
		
	end loop;
	
	close(inhandleH); close(inhandleM); 	-- release the two input files

	print("Done finding common elements in files: ",file_name_H," and ",file_name_M," ",time()); 

end attach_mer_offsets_one_file;

procedure segregate_frequent_offsets(file_name);			-- this takes the HumanMouseMerge file after sorting by offsets,
															-- and scans it for offsets occurring more frequently than the 
															-- repeated_offset_threshold. These offset elements are written to file
															-- OK_HumanMouseMerge, and the other HumanMouseMerge elements are written to
															-- HumanMouseMerge_to_verify	
		-- NOTE(review): the file_name parameter is currently unused; the input path is built from master_folder_prefix directly

	ihandle := open(master_folder_prefix + "HumanMouseMerge","TEXT-IN");
	ohandle_OK := open(master_folder_prefix + "OK_HumanMouseMerge","TEXT-OUT");
	ohandle_maybe := open(master_folder_prefix + "HumanMouseMerge_to_verify","TEXT-OUT");
	
	num_OK := num_to_verify := 0;		-- these will be counted in the loop which follows
	
	geta(ihandle,current_tagged_item);			-- these items have the form OOOOOOPPPPPPPPPnnnnnn (offset, 9-mer, address)
--print("ihandle,current_tagged_item: ",ihandle," ",current_tagged_item);

	if current_tagged_item = OM then		-- ROBUSTNESS FIX: empty input file; formerly OM(1..6) below would fault
		close(ihandle); close(ohandle_OK); close(ohandle_maybe);
		return;
	end if;

	current_offset := (x := current_tagged_item)(1..6); num_in_group := 0;
items_in_group := [];			-- collect items in group for debugging purposes
	-- these are collected as pairs [genome_address,peptide_9mer]
	
	while current_tagged_item /= OM loop		-- while there are items to be compared
		
		if (xo := current_tagged_item(1..6)) /= current_offset then		-- this group ends
			if num_in_group >= repeated_offset_threshold then		-- this is definitely an interesting offset
				printa(ohandle_OK,current_offset + str(num_in_group));
print("significant offset: ",vco := num_prefix_val(current_offset) - twice_genome_max," ",num_in_group," Hadresses: ",group_by_threes(vco,merge_sort(items_in_group)));
			else			-- we must determine if one of the elements with this offset is verifiably interesting
				printa(ohandle_maybe,current_offset + str(num_in_group));
print("offset to examine: ",num_prefix_val(current_offset) - twice_genome_max," ",num_in_group);
			end if;

items_in_group := []; 			-- collect items in group for debugging purposes
			num_in_group := 0;				-- restart the count for the new group
			current_offset := xo;			-- note the offset for the new group

		end if;
--print("num_in_group: ",num_in_group);		
		num_in_group +:= 1;		-- otherwise just count one more element in the current group
items_in_group with:= [num_prefix_val(current_tagged_item(16..)),current_tagged_item(7..15)]
; 			-- collect genome addresses of items in group and peptide_9mers for debugging purposes
		geta(ihandle,current_tagged_item);			-- read the next line

	end loop;

			-- BUG FIX: the final offset group was formerly never written, since the loop above
			-- only flushes a group when a following item with a different offset is seen
	if num_in_group > 0 then
		if num_in_group >= repeated_offset_threshold then
			printa(ohandle_OK,current_offset + str(num_in_group));
		else
			printa(ohandle_maybe,current_offset + str(num_in_group));
		end if;
	end if;
	
	close(ihandle); close(ohandle_OK); close(ohandle_maybe); 	-- release the three files used

end segregate_frequent_offsets;

procedure offsets_to_M_addresses(offsets_to_verify_file_name,common_mers_file_name);
		-- this converts the prefixed offset values of the elements in HumanMouseMerge_to_verify
		-- into prefixed Mouse chromosome addresses	
		-- it produces a file called HumanMouseMerge_Madresses
		-- NOTE(review): the body is a stub; only the file open/close scaffolding exists so far
	
	ihandle_maybe := open(master_folder_prefix + offsets_to_verify_file_name,"TEXT-IN");
	ihandle_commons := open(master_folder_prefix + common_mers_file_name,"TEXT-IN");
	ohandle := open(master_folder_prefix + "HumanMouseMerge_Madresses","TEXT-OUT");
	
			-- we scan the offsets_to_verify_file and the common_mers_file, both of which have been sorted into offset order, together
			-- for each element of the common_mers_file whose offset appears in the offsets_to_verify_file, we generate a like element 
			-- in which the offset has been converted to a mouse chromosome address, and write these items to the output file

	close(ihandle_maybe); close(ihandle_commons); close(ohandle); 	-- release the two input files and the output file

end offsets_to_M_addresses;

procedure build_H_sections(common_mers_with_M_addresses);
		-- this adds the actual data to the entries in the HumanMouseMerge_Madresses file	
		-- it produces a file called HumanMouseMerge_verif_secs
		-- NOTE(review): not yet implemented

end build_H_sections;

procedure verify_H_sections_by_M(file_name,genome_name_M);
		-- this processes the elements of HumanMouseMerge_verif_secs, scanning this file in parallel with the mouse genome
		-- and collecting all sections with a match score above our required threshold (only the H addresses of the sections are collected)
		-- it produces a file called HumanMouseMerge_verified_Haddr
		-- we sum what is in effect the negative logarithm of the probability that various features should appear in the 
		-- match of the H genome section passed to the corresponding M section. The basic formula proposed is
		-- to sum 
		-- NOTE(review): not yet implemented

end verify_H_sections_by_M;

procedure collect_verified_common_mers(accepted_offsets_file_name,residual_offsets_file_name,verification_file_name);		
		-- NOTE(review): not yet implemented

end collect_verified_common_mers;

procedure assemble_likely_exons(accepted_offsets_file_name);		
		-- NOTE(review): not yet implemented

end assemble_likely_exons;

					-- *********** auxiliary procedures, only for use in testing ***********
procedure group_by_threes(mouse_offset,tup_of_ints_and_peps);					
		-- partition the hit list by genome_address mod 3 (codon frame), abbreviate each class,
		-- and return a summary string; only the largest class gets its abbreviated detail attached.
		-- the elements of the argument tuple have the form [genome_address,peptide_9mer_starting_at_that_address]
--print("mouse_offset: ",mouse_offset);
	tup0 := abbrev_groups(mod0_seq := [x: x in tup_of_ints_and_peps | x(1) mod 3 = 0]);
	tup1 := abbrev_groups(mod1_seq := [x: x in tup_of_ints_and_peps | x(1) mod 3 = 1]);
	tup2 := abbrev_groups(mod2_seq := [x: x in tup_of_ints_and_peps | x(1) mod 3 = 2]);

	tups := [tup0,tup1,tup2];

	maxl := max/(tots := [#mod0_seq,#mod1_seq,#mod2_seq]);
	must := exists tot = tots(j) | maxl = tot;		-- j indexes the (first) largest frame class
--if j = OM or not must then print("???? ",must," ",tots," ",maxl," ",j); end if;	

	mouse_on_reverse_strand := mouse_offset + (t11 := tup_of_ints_and_peps(1)(1)) >= genome_top;		-- addresses past genome_top denote the reverse strand
	on_reverse_strand := t11 >= genome_top;		-- show the strands with the total

	tots := ["[" + str(tot) + if on_reverse_strand then "-" else "+" end if  
						+ if mouse_on_reverse_strand then "-" else "+" end if + "]": tot in tots];
	tots(j) := tots(j) + tups(j);		-- attach the abbreviated detail only to the largest class
	
	return join(tots," or ");

end group_by_threes;

procedure abbrev_groups(tup_of_ints);		-- abbreviate a tuple of genome address integers by finding groups of successive integers in it
		-- the elements of the argument tuple have the form [genome_address,peptide_9mer_starting_at_that_address]
		-- returns a comma-joined string of "start_codon_index:peptide(run_length)" items, or "None" if empty
	
	if #tup_of_ints = 0 then return "None"; end if;
	
		-- addresses at or beyond genome_top (a global) denote the reverse strand; normalize them back
		-- into forward-strand coordinates (only the first element is tested — assumes all entries lie on the same strand)
	if tup_of_ints(1)(1) >= genome_top then 
		tup_of_ints := [[x - genome_top,peps]: [x,peps] in tup_of_ints]; 
	end if;

		-- addresses within one frame differ by multiples of 3; dividing by 3 turns consecutive codons into consecutive integers
	tup_of_ints := [[x/3,peps]: [x,peps] in tup_of_ints]; 

	groups := [];
	nti := #tup_of_ints; 
	
	group_start := 1;
	
		-- each pass finds the first break in the run of consecutive codon indices (binding j there),
		-- emits the group that just ended, and restarts the scan at the break
	while exists j in [group_start + 1..nti] | tup_of_ints(j)(1) /= tup_of_ints(j - 1)(1) + 1 loop 
		groups with:= str(tup_of_ints(group_start)(1)) + ":" +  tup_of_ints(group_start)(2) +"(" + str(j - group_start) + ")"; group_start := j;
	end loop;
	 
	groups with:= str(tup_of_ints(group_start)(1)) + ":"  + tup_of_ints(group_start)(2) +"(" + str(nti - group_start + 1) + ")";		-- capture the last group
	
	return join(groups,",");
	
end abbrev_groups;

procedure sort_by_mers(file_name_list);		-- auxiliary sorting routine for testing: sort items alphabetically by 9-mers
		-- each file in the list is read whole, sorted lexicographically, and rewritten in place

	for fname in file_name_list loop

		sorted_lines := merge_sort(get_lines(fname));		-- read every line and sort lexicographically

		out_handle := open(fname,"TEXT-OUT");				-- reopen the same file, truncating its prior contents
		for sorted_line in sorted_lines loop 
			printa(out_handle,sorted_line); 				-- emit the lines in sorted order
		end loop;
		close(out_handle);									-- flush and release the file

	end loop;
	
end sort_by_mers;

procedure sort_by_offsets(file_name);		-- auxiliary sorting routine for testing: sort items numerically by offset values
		-- the first 6 characters of each line are a base-100 packed integer (see num_prefix_val);
		-- the file is rewritten in place, ordered by that decoded value

	full_path := master_folder_prefix + file_name;			-- data files live under the master folder (global prefix)

		-- tag every line with its decoded numeric prefix, then sort on the tag
	keyed_lines := merge_sort([[num_prefix_val(line(1..6)),line]: line in get_lines(full_path)]);

	ohandle := open(full_path,"TEXT-OUT");					-- reopen the file, truncating its prior contents

--print("num keyed_lines: ",#keyed_lines," ",full_path," ",ohandle);

	for [tag,line] in keyed_lines loop 
		printa(ohandle,line); 								-- write back in numeric order, dropping the tags
	end loop;

	close(ohandle);											-- release the file

end sort_by_offsets;

procedure sort_by_addresses(file_name);		-- auxiliary sorting routine for testing: sort items numerically by address values
		-- behaves exactly like sort_by_offsets: decode the 6-character packed prefix of each line,
		-- sort on it, and rewrite the file (under the global master_folder_prefix) in place

	decorated := [[num_prefix_val(a_line(1..6)),a_line]: a_line in get_lines(master_folder_prefix + file_name)];
	decorated := merge_sort(decorated);						-- order lines by their decoded numeric prefix

	out_file := open(master_folder_prefix + file_name,"TEXT-OUT");		-- truncate and rewrite the sorted file
	for [key,a_line] in decorated loop printa(out_file,a_line); end loop;

	close(out_file);										-- release the file

end sort_by_addresses;

procedure num_prefix_val(stg);		-- expand a coded numerical string into its integer value
		-- each character encodes one base-100 digit as its character code minus 33
		-- (so the printable range "!".."~" maps to digit values 0..93); digits are most-significant first

	decoded := 0;
	for ch in stg loop 
		decoded := decoded * 100 + (abs(ch) - 33);	-- abs of a 1-char string yields its character code
	end loop;

	return decoded;
	
end num_prefix_val;

end test;