Ana3.1 產生資料


在此節中實作一個產生資料的程式,其結果可以應用在後續的分群實作中。

安裝模組

使用 Unofficial Windows Binaries for Python Extension Packages 網站中的 Windows 安裝檔安裝相關模組(本節程式需要 matplotlib)。

此資料產生程式的概念流程如下:

# 顯示開始訊息

# 加入2維常態分佈,即各維度的參數(mu, sigma)
# 產生2維資料點,即(x, y, label)
# 將資料點存入檔案
# 顯示2維資料點分佈圖

# 顯示結束訊息
  • 參考檔案: 2d-data_generator.py
# coding=utf-8
# 匯入模組
import copy
import matplotlib.pyplot as plt
import random
import sys


# Define module-level variables.
# Cached plot limits in the order [x_min, x_max, y_min, y_max]. Starts empty
# and is filled by plot_display_axis_configuration() the first time it runs;
# later calls reuse the stored values instead of recomputing them.
PLOT_DISPLAY_AXIS = []


# 定義函數
def generate_2d_data(num_data, normal_param):
    """Draw labelled 2-D sample points from a list of Gaussian clusters.

    Args:
        num_data: Number of points to draw for each cluster.
        normal_param: Iterable of clusters. Each cluster is a pair
            ``(x_param, y_param)`` whose first two items are the
            ``(mu, sigma)`` arguments handed to ``random.gauss`` for the
            x and y coordinate respectively.

    Returns:
        A list of ``(x, y, label)`` tuples grouped by cluster in input
        order, where ``label`` is the 1-based position of the cluster.
    """
    samples = []

    for label, (x_param, y_param) in enumerate(normal_param, start=1):
        # x is drawn before y for every point, so a seeded generator
        # yields a reproducible sequence.
        samples.extend(
            (
                random.gauss(x_param[0], x_param[1]),
                random.gauss(y_param[0], y_param[1]),
                label,
            )
            for _ in range(num_data)
        )

    return samples


def plot_display_axis_configuration(data_list):
    """Return the plot limits ``[x_min, x_max, y_min, y_max]`` for the figures.

    The limits are computed once, from the first non-empty ``data_list``
    this function receives, as the bounding box of the points widened by
    1 on every side. The result is stored in the module-level
    ``PLOT_DISPLAY_AXIS`` and returned unchanged by every later call, so
    all figures drawn afterwards share the same axes regardless of the
    data passed in.

    Args:
        data_list: Sequence of ``(x, y, label)`` tuples.

    Returns:
        The cached limits list. If nothing has been cached yet and
        ``data_list`` is empty, the (still empty) cache is returned and
        nothing is stored.
    """
    global PLOT_DISPLAY_AXIS

    # Only compute when there is no cached box AND there is data to measure;
    # an empty input must not poison the cache with meaningless limits.
    if not PLOT_DISPLAY_AXIS and data_list:
        # Single pass over the data: split the tuples into coordinate columns.
        xs, ys, _labels = zip(*data_list)
        PLOT_DISPLAY_AXIS = [min(xs) - 1, max(xs) + 1, min(ys) - 1, max(ys) + 1]

    return PLOT_DISPLAY_AXIS


def save_data_to_file(file_name="2d-data.txt", data_list=None):
    """Write data points to a text file, one comma-separated record per line.

    Args:
        file_name: Path of the file to create or overwrite.
        data_list: Sequence of records; every field of a record is
            converted with ``str`` and the fields are joined by ",".

    Returns:
        None. When ``data_list`` is empty or None an error message is
        printed and no file is opened.
    """
    if not data_list:
        print("[ERR][Msg] No data to save.")
        return

    with open(file_name, "w") as out:
        out.writelines(
            ",".join(str(field) for field in record) + "\n"
            for record in data_list
        )


def visualize_2d_data(file_name="2d-data.png", data_list=None, image_show=True, image_save=True):
    """Draw a scatter plot of labelled 2-D points, one marker style per label.

    Args:
        file_name: Path of the image written when ``image_save`` is true.
        data_list: Sequence of ``(x, y, label)`` tuples. ``label - 1`` is
            used to index the marker table, so labels must be integers.
        image_show: When true, display the figure with ``plt.show()``.
        image_save: When true, save the figure to ``file_name``.

    Returns:
        None. When ``data_list`` is empty or None an error message is
        printed and nothing is drawn.
    """
    if not data_list:
        print("[ERR][Msg] No data to draw.")
        return

    # plot parameters
    plot_marker_alpha = 0.75
    plot_marker_list = ['o', 'x', 'D', '8', 's', '^', 'v', 'H']
    plot_marker_size = 25
    plot_x_size = 6
    plot_y_size = 6
    plot_dpi = 300

    # prepare data: collect the x and y coordinates of every label
    data_dict = {}

    for x, y, c in data_list:
        group = data_dict.setdefault(c, {'x': [], 'y': []})
        group['x'].append(x)
        group['y'].append(y)

    # draw data figure
    fig, axes = plt.subplots(figsize=(plot_x_size, plot_y_size), facecolor='w')

    try:
        for c in sorted(data_dict):
            # Wrap around the marker table so any number of labels can be
            # drawn; a fixed-length table raised IndexError for large labels.
            marker_style = plot_marker_list[(c - 1) % len(plot_marker_list)]
            axes.scatter(data_dict[c]['x'], data_dict[c]['y'], marker=marker_style,
                         s=plot_marker_size, alpha=plot_marker_alpha)

        # plot setting: axis limits come from plot_display_axis_configuration
        axes.axis(plot_display_axis_configuration(data_list))
        axes.xaxis.set_visible(False)
        axes.yaxis.set_visible(False)

        # save and show figure
        plt.tight_layout()

        if image_save:
            plt.savefig(file_name, dpi=plot_dpi, bbox_inches='tight', pad_inches=0.05)

        if image_show:
            plt.show()
    finally:
        # Release the figure even if saving or showing fails.
        plt.close(fig)


def two_dimension_data_generator(filename_prefix="2d-data", num_train_data=50, num_test_data=10):
    """Generate, save and plot a training set and a testing set of 2-D points.

    Three clusters are sampled with ``generate_2d_data``; each set is
    written to ``<prefix>-train.txt`` / ``<prefix>-test.txt`` and drawn to
    ``<prefix>-train.png`` / ``<prefix>-test.png`` without opening a window.

    Args:
        filename_prefix: Prefix shared by all four output file names.
        num_train_data: Points per cluster in the training set.
        num_test_data: Points per cluster in the testing set.
    """
    print(">>> Two dimensions data generator")
    print()

    # One entry per cluster: [(mu, sigma) for x, (mu, sigma) for y].
    normal_param = [
        [(-5, 1), (-5, 1)],
        [(0, 2), (0, 2)],
        [(5, 2), (5, 2)],
    ]

    # Training data is sampled before testing data.
    print("[Msg] Generate 2D training and testing data.")
    train_data = generate_2d_data(num_train_data, normal_param)
    test_data = generate_2d_data(num_test_data, normal_param)

    # (text file template, image file template, points) for each split.
    outputs = (
        ("{0}-train.txt", "{0}-train.png", train_data),
        ("{0}-test.txt", "{0}-test.png", test_data),
    )

    print("[Msg] Save the generated-data to file.")
    for text_template, _, points in outputs:
        save_data_to_file(file_name=text_template.format(filename_prefix), data_list=points)

    print("[Msg] Visualize the generated-data.")
    for _, image_template, points in outputs:
        visualize_2d_data(
            file_name=image_template.format(filename_prefix),
            data_list=points,
            image_show=False,
            image_save=True,
        )

    print()
    print(">>> STOP Two dimensions data generator")


if __name__ == "__main__":
    # Script entry point: 50 training and 10 testing points per cluster,
    # written to files whose names start with "2d-data".
    two_dimension_data_generator(
        filename_prefix="2d-data",
        num_train_data=50,
        num_test_data=10,
    )
  • 參考檔案: 2d-data-train.txt
# 有3個2維常態分佈,每一個分佈產生50個資料點,總共150個資料點
-5.936859667128214,-4.687474648857834,1
-4.421855966971912,-6.578596376510371,1
-3.576573716991786,-3.3683064010729327,1
-4.16682880174316,-5.639787037868259,1
-3.583178023929446,-5.036852091460645,1
-5.364425272017589,-4.405944781481657,1
-5.398419103772747,-5.247071908232659,1
-6.386661172008869,-4.018297819177463,1
-3.4211201992830667,-5.21981113285697,1
-4.716363960821377,-5.354505134133797,1
-4.248143075434192,-6.265795323057008,1
-5.188898820470285,-4.952836004333812,1
-5.934261570875925,-5.560803083946354,1
-6.193965253359128,-4.774502858507573,1
-5.621043926743399,-5.001580868688339,1
-2.4996094415981993,-4.5826678634378,1
-5.895769190545542,-3.0611287378789793,1
-5.02333081804984,-4.513081613741614,1
-4.055225563049044,-5.574696650542014,1
-4.718224406686188,-4.678129339497548,1
-6.673791102208524,-6.491093060340415,1
-6.607338453702324,-5.26268677871846,1
-5.351797568672349,-5.647428121682442,1
-5.196515904760404,-5.712709563434432,1
-4.488310807322167,-5.449127263380302,1
-3.9720879634644612,-5.3708705533454095,1
-6.620342862950516,-4.6368264526896965,1
-5.3997521857026785,-5.798832830605025,1
-4.611576194033135,-5.13551836229383,1
-5.042478241075711,-7.087456440898236,1
-4.173198140904893,-7.121956759519188,1
-5.832083019877212,-4.4174681175972985,1
-5.269648078375164,-5.3024129533602276,1
-4.850217020005996,-6.017023110991571,1
-4.686666127003442,-3.1808059665734048,1
-4.500888092158091,-6.346579687545951,1
-4.2536080081445045,-4.008316399380053,1
-4.734675476048596,-5.696646308156895,1
-4.711986432861449,-4.495949799555453,1
-4.589960141563215,-4.8698685043811,1
-3.5324930956471516,-5.113103259560783,1
-5.277671758642005,-4.918963228467579,1
-4.160210938184801,-5.6909565316014925,1
-5.098848953626168,-5.581116022543412,1
-4.090492417777604,-5.734214699387528,1
-4.454235441316957,-4.812744687873902,1
-4.665336625460384,-6.630754307366127,1
-4.445719434827256,-4.91138107256277,1
-4.696005526498868,-5.9265401360695344,1
-6.439491742486804,-5.548740952013887,1
0.6291197484033274,0.44308913399577865,2
1.4816007579379067,2.4628237612489685,2
-2.110368543466534,4.901832815398117,2
1.2092359268158455,-2.5771211104215213,2
-0.7682339118627867,-1.1426446235089507,2
0.9382070390429161,1.2122233722704285,2
1.3218255912086774,-1.4885522848507788,2
-2.43900346393017,0.4640063018860766,2
-1.4481898432997926,-2.2664459302569453,2
-0.7684192032505172,-2.3209329253285076,2
-3.3845111237388825,2.440425780249234,2
0.9260538962533489,3.30438379265415,2
-0.603329858439318,0.5362838405391656,2
-2.7735422538788024,2.288570332042201,2
-0.3654950003598042,0.2870521885735266,2
0.45637551338865995,-0.7200850485729887,2
-1.7307730288433236,-2.139055944402314,2
2.201119652251603,0.0016943483108358432,2
-4.36274491485056,-2.3854262523819547,2
1.2844651799483178,-4.237684531746343,2
0.37517886503212683,-1.6698715925619054,2
1.442879151253225,-2.029560079597978,2
-0.4393308691222324,0.6602622407251917,2
-3.1484874053834355,0.3025974359703895,2
4.569835105847017,-1.3325499018318803,2
3.058127960460761,-1.5211225890894848,2
2.849080743204125,4.258815689605345,2
-1.35621751546613,-2.5774956391644555,2
-2.4238914555379054,2.2794524910571545,2
1.8864284656425985,-1.4691443241812674,2
-1.9141209255168907,1.806782153723014,2
-4.678570539085601,3.0533735447675427,2
1.2334623519098484,-1.6152224924183323,2
-3.8418067545429673,1.9304663085844256,2
-0.7793385954033581,-1.0033569225675003,2
0.6003878783508978,1.0146715187961366,2
-1.1154552072557777,-0.1923661732013327,2
0.8614071306080272,-2.35065024766088,2
-3.9313851299611806,-1.351972551121476,2
-1.5371933699743408,-2.3006384792790047,2
-2.609406388864568,1.498424350572178,2
0.7189366818747213,1.955972718578612,2
-0.904740688634896,3.238621282257209,2
3.6474534009847033,0.281972624630704,2
-3.5276810735617268,-0.003235120357206731,2
0.38567181097737285,-1.477661334049111,2
0.015389264259229794,1.3022579617463512,2
1.0073053787602086,3.259491240664891,2
1.0273282322290413,-3.426841419226482,2
-0.3610267622600211,-0.5170618654324627,2
3.374573337587148,5.2885262379924285,3
1.9937972095184184,6.230320452863269,3
2.0569894021626802,5.965436412062519,3
6.725766258487863,6.849607999893482,3
3.825800380638579,4.320785284599364,3
4.860404752556266,3.637847295435087,3
5.154001011800958,-0.8147047779048915,3
7.200828825934392,2.312435961188624,3
4.181718328478866,3.31375857706516,3
0.4145609922126958,7.709376078546086,3
2.5760267340837992,5.588521315167497,3
5.80174352057608,7.975960630546668,3
5.435126127420551,4.7967662832772815,3
3.589854962772908,3.9076305979038595,3
5.9206946239470115,5.004234017857484,3
2.4343301851833057,6.651043037802342,3
5.989766384318995,4.685756867952425,3
7.290387117715092,3.0369715070306196,3
8.708699044294807,5.022815759438792,3
4.613673857828949,5.250942105159187,3
6.734268974970625,5.794194416326724,3
4.932965402478696,4.017701368324139,3
5.8705888884746775,5.478183557433612,3
5.051794581614221,7.258382846190605,3
3.1389780314903692,4.203482478938865,3
5.284248492822195,2.7886882653311384,3
5.873609725251368,6.005406257972737,3
8.361174715434485,9.124281940820946,3
6.025145892941708,5.3244312020290705,3
6.921472013702212,7.803082509769744,3
4.733375373259311,6.024506989230847,3
5.391118387593023,3.3931730130601943,3
3.853154747336122,3.8388539420262067,3
6.552133507565144,3.2858079954900483,3
5.588418550911221,4.015780862797407,3
4.970611823336491,2.896813026000977,3
8.70970484193779,4.685399052989462,3
5.964903754182224,5.598257920775154,3
6.194816502014051,5.711091138888581,3
7.981392049859905,6.3944602224887355,3
6.919631421418551,1.4510022198139922,3
4.229552822771621,3.7942801561324013,3
7.275570162023493,1.7887058530720634,3
4.4999228562178395,3.799855737784699,3
5.33309116004421,3.2207943019648253,3
1.993370353284004,5.1055877746719105,3
5.102117123183408,5.514101583723206,3
1.1852562482359592,4.347395971140128,3
5.7196134094182955,6.283146017216334,3
8.167732645504145,5.791043279318752,3
  • 參考檔案: 2d-data-train.png

參考資料

results matching ""

    No results matching ""